Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull first set of KVM updates from Paolo Bonzini:
"PPC:
- minor code cleanups

x86:
- PCID emulation and CR3 caching for shadow page tables
- nested VMX live migration
- nested VMCS shadowing
- optimized IPI hypercall
- some optimizations

ARM will come next week"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (85 commits)
kvm: x86: Set highest physical address bits in non-present/reserved SPTEs
KVM/x86: Use CC_SET()/CC_OUT in arch/x86/kvm/vmx.c
KVM: X86: Implement PV IPIs in linux guest
KVM: X86: Add kvm hypervisor init time platform setup callback
KVM: X86: Implement "send IPI" hypercall
KVM/x86: Move X86_CR4_OSXSAVE check into kvm_valid_sregs()
KVM: x86: Skip pae_root shadow allocation if tdp enabled
KVM/MMU: Combine flushing remote tlb in mmu_set_spte()
KVM: vmx: skip VMWRITE of HOST_{FS,GS}_BASE when possible
KVM: vmx: skip VMWRITE of HOST_{FS,GS}_SEL when possible
KVM: vmx: always initialize HOST_{FS,GS}_BASE to zero during setup
KVM: vmx: move struct host_state usage to struct loaded_vmcs
KVM: vmx: compute need to reload FS/GS/LDT on demand
KVM: nVMX: remove a misleading comment regarding vmcs02 fields
KVM: vmx: rename __vmx_load_host_state() and vmx_save_host_state()
KVM: vmx: add dedicated utility to access guest's kernel_gs_base
KVM: vmx: track host_state.loaded using a loaded_vmcs pointer
KVM: vmx: refactor segmentation code in vmx_save_host_state()
kvm: nVMX: Fix fault priority for VMX operations
kvm: nVMX: Fix fault vector for VMX operation at CPL > 0
...

+3115 -731
+56
Documentation/virtual/kvm/api.txt
··· 3561 3561 -ENOENT on deassign if the conn_id isn't registered 3562 3562 -EEXIST on assign if the conn_id is already registered 3563 3563 3564 + 4.114 KVM_GET_NESTED_STATE 3565 + 3566 + Capability: KVM_CAP_NESTED_STATE 3567 + Architectures: x86 3568 + Type: vcpu ioctl 3569 + Parameters: struct kvm_nested_state (in/out) 3570 + Returns: 0 on success, -1 on error 3571 + Errors: 3572 + E2BIG: the total state size (including the fixed-size part of struct 3573 + kvm_nested_state) exceeds the value of 'size' specified by 3574 + the user; the size required will be written into size. 3575 + 3576 + struct kvm_nested_state { 3577 + __u16 flags; 3578 + __u16 format; 3579 + __u32 size; 3580 + union { 3581 + struct kvm_vmx_nested_state vmx; 3582 + struct kvm_svm_nested_state svm; 3583 + __u8 pad[120]; 3584 + }; 3585 + __u8 data[0]; 3586 + }; 3587 + 3588 + #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 3589 + #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 3590 + 3591 + #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 3592 + #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 3593 + 3594 + struct kvm_vmx_nested_state { 3595 + __u64 vmxon_pa; 3596 + __u64 vmcs_pa; 3597 + 3598 + struct { 3599 + __u16 flags; 3600 + } smm; 3601 + }; 3602 + 3603 + This ioctl copies the vcpu's nested virtualization state from the kernel to 3604 + userspace. 3605 + 3606 + The maximum size of the state, including the fixed-size part of struct 3607 + kvm_nested_state, can be retrieved by passing KVM_CAP_NESTED_STATE to 3608 + the KVM_CHECK_EXTENSION ioctl(). 3609 + 3610 + 4.115 KVM_SET_NESTED_STATE 3611 + 3612 + Capability: KVM_CAP_NESTED_STATE 3613 + Architectures: x86 3614 + Type: vcpu ioctl 3615 + Parameters: struct kvm_nested_state (in) 3616 + Returns: 0 on success, -1 on error 3617 + 3618 + This copies the vcpu's kvm_nested_state struct from userspace to the kernel. For 3619 + the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. 3564 3620 3565 3621 5. 
The kvm_run structure 3566 3622 ------------------------
+4
Documentation/virtual/kvm/cpuid.txt
··· 62 62 || || can be enabled by setting bit 2 63 63 || || when writing to msr 0x4b564d02 64 64 ------------------------------------------------------------------------------ 65 + KVM_FEATURE_PV_SEND_IPI || 11 || guest checks this feature bit 66 + || || before using paravirtualized 67 + || || send IPIs. 68 + ------------------------------------------------------------------------------ 65 69 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side 66 70 || || per-cpu warps are expected in 67 71 || || kvmclock.
+20
Documentation/virtual/kvm/hypercalls.txt
··· 121 121 122 122 Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, 123 123 or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. 124 + 125 + 6. KVM_HC_SEND_IPI 126 + ------------------------ 127 + Architecture: x86 128 + Status: active 129 + Purpose: Send IPIs to multiple vCPUs. 130 + 131 + a0: lower part of the bitmap of destination APIC IDs 132 + a1: higher part of the bitmap of destination APIC IDs 133 + a2: the lowest APIC ID in bitmap 134 + a3: APIC ICR 135 + 136 + The hypercall lets a guest send multicast IPIs, with at most 128 137 + 128 destinations per hypercall in 64-bit mode and 64 vCPUs per 138 + hypercall in 32-bit mode. The destinations are represented by a 139 + bitmap contained in the first two arguments (a0 and a1). Bit 0 of 140 + a0 corresponds to the APIC ID in the third argument (a2), bit 1 141 + corresponds to the APIC ID a2+1, and so on. 142 + 143 + Returns the number of CPUs to which the IPIs were delivered successfully.
+47
arch/powerpc/include/asm/kvm_book3s.h
··· 390 390 #define SPLIT_HACK_MASK 0xff000000 391 391 #define SPLIT_HACK_OFFS 0xfb000000 392 392 393 + /* 394 + * This packs a VCPU ID from the [0..KVM_MAX_VCPU_ID) space down to the 395 + * [0..KVM_MAX_VCPUS) space, using knowledge of the guest's core stride 396 + * (but not its actual threading mode, which is not available) to avoid 397 + * collisions. 398 + * 399 + * The implementation leaves VCPU IDs from the range [0..KVM_MAX_VCPUS) (block 400 + * 0) unchanged: if the guest is filling each VCORE completely then it will be 401 + * using consecutive IDs and it will fill the space without any packing. 402 + * 403 + * For higher VCPU IDs, the packed ID is based on the VCPU ID modulo 404 + * KVM_MAX_VCPUS (effectively masking off the top bits) and then an offset is 405 + * added to avoid collisions. 406 + * 407 + * VCPU IDs in the range [KVM_MAX_VCPUS..(KVM_MAX_VCPUS*2)) (block 1) are only 408 + * possible if the guest is leaving at least 1/2 of each VCORE empty, so IDs 409 + * can be safely packed into the second half of each VCORE by adding an offset 410 + * of (stride / 2). 411 + * 412 + * Similarly, if VCPU IDs in the range [(KVM_MAX_VCPUS*2)..(KVM_MAX_VCPUS*4)) 413 + * (blocks 2 and 3) are seen, the guest must be leaving at least 3/4 of each 414 + * VCORE empty so packed IDs can be offset by (stride / 4) and (stride * 3 / 4). 415 + * 416 + * Finally, VCPU IDs from blocks 5..7 will only be seen if the guest is using a 417 + * stride of 8 and 1 thread per core so the remaining offsets of 1, 5, 3 and 7 418 + * must be free to use. 419 + * 420 + * (The offsets for each block are stored in block_offsets[], indexed by the 421 + * block number if the stride is 8. For cases where the guest's stride is less 422 + * than 8, we can re-use the block_offsets array by multiplying the block 423 + * number by (MAX_SMT_THREADS / stride) to reach the correct entry.) 
424 + */ 425 + static inline u32 kvmppc_pack_vcpu_id(struct kvm *kvm, u32 id) 426 + { 427 + const int block_offsets[MAX_SMT_THREADS] = {0, 4, 2, 6, 1, 5, 3, 7}; 428 + int stride = kvm->arch.emul_smt_mode; 429 + int block = (id / KVM_MAX_VCPUS) * (MAX_SMT_THREADS / stride); 430 + u32 packed_id; 431 + 432 + if (WARN_ONCE(block >= MAX_SMT_THREADS, "VCPU ID too large to pack")) 433 + return 0; 434 + packed_id = (id % KVM_MAX_VCPUS) + block_offsets[block]; 435 + if (WARN_ONCE(packed_id >= KVM_MAX_VCPUS, "VCPU ID packing failed")) 436 + return 0; 437 + return packed_id; 438 + } 439 + 393 440 #endif /* __ASM_KVM_BOOK3S_H__ */
+16 -10
arch/powerpc/include/asm/kvm_host.h
··· 42 42 #define KVM_USER_MEM_SLOTS 512 43 43 44 44 #include <asm/cputhreads.h> 45 - #define KVM_MAX_VCPU_ID (threads_per_subcore * KVM_MAX_VCORES) 45 + 46 + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 47 + #include <asm/kvm_book3s_asm.h> /* for MAX_SMT_THREADS */ 48 + #define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES) 49 + 50 + #else 51 + #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS 52 + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 46 53 47 54 #define __KVM_HAVE_ARCH_INTC_INITIALIZED 48 55 ··· 679 672 gva_t vaddr_accessed; 680 673 pgd_t *pgdir; 681 674 682 - u8 io_gpr; /* GPR used as IO source/target */ 675 + u16 io_gpr; /* GPR used as IO source/target */ 683 676 u8 mmio_host_swabbed; 684 677 u8 mmio_sign_extend; 685 678 /* conversion between single and double precision */ ··· 695 688 */ 696 689 u8 mmio_vsx_copy_nums; 697 690 u8 mmio_vsx_offset; 698 - u8 mmio_vsx_tx_sx_enabled; 699 691 u8 mmio_vmx_copy_nums; 700 692 u8 mmio_vmx_offset; 701 693 u8 mmio_copy_type; ··· 807 801 #define KVMPPC_VCPU_BUSY_IN_HOST 2 808 802 809 803 /* Values for vcpu->arch.io_gpr */ 810 - #define KVM_MMIO_REG_MASK 0x001f 811 - #define KVM_MMIO_REG_EXT_MASK 0xffe0 804 + #define KVM_MMIO_REG_MASK 0x003f 805 + #define KVM_MMIO_REG_EXT_MASK 0xffc0 812 806 #define KVM_MMIO_REG_GPR 0x0000 813 - #define KVM_MMIO_REG_FPR 0x0020 814 - #define KVM_MMIO_REG_QPR 0x0040 815 - #define KVM_MMIO_REG_FQPR 0x0060 816 - #define KVM_MMIO_REG_VSX 0x0080 817 - #define KVM_MMIO_REG_VMX 0x00c0 807 + #define KVM_MMIO_REG_FPR 0x0040 808 + #define KVM_MMIO_REG_QPR 0x0080 809 + #define KVM_MMIO_REG_FQPR 0x00c0 810 + #define KVM_MMIO_REG_VSX 0x0100 811 + #define KVM_MMIO_REG_VMX 0x0180 818 812 819 813 #define __KVM_HAVE_ARCH_WQP 820 814 #define __KVM_HAVE_CREATE_DEVICE
+1 -1
arch/powerpc/include/asm/reg.h
··· 163 163 #define PSSCR_ESL 0x00200000 /* Enable State Loss */ 164 164 #define PSSCR_SD 0x00400000 /* Status Disable */ 165 165 #define PSSCR_PLS 0xf000000000000000 /* Power-saving Level Status */ 166 - #define PSSCR_GUEST_VIS 0xf0000000000003ff /* Guest-visible PSSCR fields */ 166 + #define PSSCR_GUEST_VIS 0xf0000000000003ffUL /* Guest-visible PSSCR fields */ 167 167 #define PSSCR_FAKE_SUSPEND 0x00000400 /* Fake-suspend bit (P9 DD2.2) */ 168 168 #define PSSCR_FAKE_SUSPEND_LG 10 /* Fake-suspend bit position */ 169 169
+2 -3
arch/powerpc/kvm/book3s_64_vio.c
··· 179 179 if ((tbltmp->it_page_shift <= stt->page_shift) && 180 180 (tbltmp->it_offset << tbltmp->it_page_shift == 181 181 stt->offset << stt->page_shift) && 182 - (tbltmp->it_size << tbltmp->it_page_shift == 182 + (tbltmp->it_size << tbltmp->it_page_shift >= 183 183 stt->size << stt->page_shift)) { 184 184 /* 185 185 * Reference the table to avoid races with ··· 295 295 { 296 296 struct kvmppc_spapr_tce_table *stt = NULL; 297 297 struct kvmppc_spapr_tce_table *siter; 298 - unsigned long npages, size; 298 + unsigned long npages, size = args->size; 299 299 int ret = -ENOMEM; 300 300 int i; 301 301 ··· 303 303 (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) 304 304 return -EINVAL; 305 305 306 - size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); 307 306 npages = kvmppc_tce_pages(size); 308 307 ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); 309 308 if (ret)
+29 -13
arch/powerpc/kvm/book3s_hv.c
··· 127 127 * and SPURR count and should be set according to the number of 128 128 * online threads in the vcore being run. 129 129 */ 130 - #define RWMR_RPA_P8_1THREAD 0x164520C62609AECA 131 - #define RWMR_RPA_P8_2THREAD 0x7FFF2908450D8DA9 132 - #define RWMR_RPA_P8_3THREAD 0x164520C62609AECA 133 - #define RWMR_RPA_P8_4THREAD 0x199A421245058DA9 134 - #define RWMR_RPA_P8_5THREAD 0x164520C62609AECA 135 - #define RWMR_RPA_P8_6THREAD 0x164520C62609AECA 136 - #define RWMR_RPA_P8_7THREAD 0x164520C62609AECA 137 - #define RWMR_RPA_P8_8THREAD 0x164520C62609AECA 130 + #define RWMR_RPA_P8_1THREAD 0x164520C62609AECAUL 131 + #define RWMR_RPA_P8_2THREAD 0x7FFF2908450D8DA9UL 132 + #define RWMR_RPA_P8_3THREAD 0x164520C62609AECAUL 133 + #define RWMR_RPA_P8_4THREAD 0x199A421245058DA9UL 134 + #define RWMR_RPA_P8_5THREAD 0x164520C62609AECAUL 135 + #define RWMR_RPA_P8_6THREAD 0x164520C62609AECAUL 136 + #define RWMR_RPA_P8_7THREAD 0x164520C62609AECAUL 137 + #define RWMR_RPA_P8_8THREAD 0x164520C62609AECAUL 138 138 139 139 static unsigned long p8_rwmr_values[MAX_SMT_THREADS + 1] = { 140 140 RWMR_RPA_P8_1THREAD, ··· 1807 1807 return threads_per_subcore; 1808 1808 } 1809 1809 1810 - static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) 1810 + static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id) 1811 1811 { 1812 1812 struct kvmppc_vcore *vcore; 1813 1813 ··· 1821 1821 init_swait_queue_head(&vcore->wq); 1822 1822 vcore->preempt_tb = TB_NIL; 1823 1823 vcore->lpcr = kvm->arch.lpcr; 1824 - vcore->first_vcpuid = core * kvm->arch.smt_mode; 1824 + vcore->first_vcpuid = id; 1825 1825 vcore->kvm = kvm; 1826 1826 INIT_LIST_HEAD(&vcore->preempt_list); 1827 1827 ··· 2037 2037 mutex_lock(&kvm->lock); 2038 2038 vcore = NULL; 2039 2039 err = -EINVAL; 2040 - core = id / kvm->arch.smt_mode; 2040 + if (cpu_has_feature(CPU_FTR_ARCH_300)) { 2041 + if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode)) { 2042 + pr_devel("KVM: VCPU ID too high\n"); 2043 + core = 
KVM_MAX_VCORES; 2044 + } else { 2045 + BUG_ON(kvm->arch.smt_mode != 1); 2046 + core = kvmppc_pack_vcpu_id(kvm, id); 2047 + } 2048 + } else { 2049 + core = id / kvm->arch.smt_mode; 2050 + } 2041 2051 if (core < KVM_MAX_VCORES) { 2042 2052 vcore = kvm->arch.vcores[core]; 2043 - if (!vcore) { 2053 + if (vcore && cpu_has_feature(CPU_FTR_ARCH_300)) { 2054 + pr_devel("KVM: collision on id %u", id); 2055 + vcore = NULL; 2056 + } else if (!vcore) { 2044 2057 err = -ENOMEM; 2045 - vcore = kvmppc_vcore_create(kvm, core); 2058 + vcore = kvmppc_vcore_create(kvm, 2059 + id & ~(kvm->arch.smt_mode - 1)); 2046 2060 kvm->arch.vcores[core] = vcore; 2047 2061 kvm->arch.online_vcores++; 2048 2062 } ··· 4564 4550 pr_err("KVM-HV: Cannot determine method for accessing XICS\n"); 4565 4551 return -ENODEV; 4566 4552 } 4553 + /* presence of intc confirmed - node can be dropped again */ 4554 + of_node_put(np); 4567 4555 } 4568 4556 #endif 4569 4557
+12 -7
arch/powerpc/kvm/book3s_xive.c
··· 317 317 return -EBUSY; 318 318 } 319 319 320 + static u32 xive_vp(struct kvmppc_xive *xive, u32 server) 321 + { 322 + return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server); 323 + } 324 + 320 325 static u8 xive_lock_and_mask(struct kvmppc_xive *xive, 321 326 struct kvmppc_xive_src_block *sb, 322 327 struct kvmppc_xive_irq_state *state) ··· 367 362 */ 368 363 if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { 369 364 xive_native_configure_irq(hw_num, 370 - xive->vp_base + state->act_server, 365 + xive_vp(xive, state->act_server), 371 366 MASKED, state->number); 372 367 /* set old_p so we can track if an H_EOI was done */ 373 368 state->old_p = true; ··· 423 418 */ 424 419 if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { 425 420 xive_native_configure_irq(hw_num, 426 - xive->vp_base + state->act_server, 421 + xive_vp(xive, state->act_server), 427 422 state->act_priority, state->number); 428 423 /* If an EOI is needed, do it here */ 429 424 if (!state->old_p) ··· 500 495 kvmppc_xive_select_irq(state, &hw_num, NULL); 501 496 502 497 return xive_native_configure_irq(hw_num, 503 - xive->vp_base + server, 498 + xive_vp(xive, server), 504 499 prio, state->number); 505 500 } 506 501 ··· 888 883 * which is fine for a never started interrupt. 
889 884 */ 890 885 xive_native_configure_irq(hw_irq, 891 - xive->vp_base + state->act_server, 886 + xive_vp(xive, state->act_server), 892 887 state->act_priority, state->number); 893 888 894 889 /* ··· 964 959 965 960 /* Reconfigure the IPI */ 966 961 xive_native_configure_irq(state->ipi_number, 967 - xive->vp_base + state->act_server, 962 + xive_vp(xive, state->act_server), 968 963 state->act_priority, state->number); 969 964 970 965 /* ··· 1089 1084 pr_devel("Duplicate !\n"); 1090 1085 return -EEXIST; 1091 1086 } 1092 - if (cpu >= KVM_MAX_VCPUS) { 1087 + if (cpu >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) { 1093 1088 pr_devel("Out of bounds !\n"); 1094 1089 return -EINVAL; 1095 1090 } ··· 1103 1098 xc->xive = xive; 1104 1099 xc->vcpu = vcpu; 1105 1100 xc->server_num = cpu; 1106 - xc->vp_id = xive->vp_base + cpu; 1101 + xc->vp_id = xive_vp(xive, cpu); 1107 1102 xc->mfrr = 0xff; 1108 1103 xc->valid = true; 1109 1104
+3 -4
arch/powerpc/kvm/emulate_loadstore.c
··· 106 106 * if mmio_vsx_tx_sx_enabled == 1, copy data between 107 107 * VSR[32..63] and memory 108 108 */ 109 - vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst); 110 109 vcpu->arch.mmio_vsx_copy_nums = 0; 111 110 vcpu->arch.mmio_vsx_offset = 0; 112 111 vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE; ··· 241 242 } 242 243 243 244 emulated = kvmppc_handle_vsx_load(run, vcpu, 244 - KVM_MMIO_REG_VSX | (op.reg & 0x1f), 245 - io_size_each, 1, op.type & SIGNEXT); 245 + KVM_MMIO_REG_VSX|op.reg, io_size_each, 246 + 1, op.type & SIGNEXT); 246 247 break; 247 248 } 248 249 #endif ··· 362 363 } 363 364 364 365 emulated = kvmppc_handle_vsx_store(run, vcpu, 365 - op.reg & 0x1f, io_size_each, 1); 366 + op.reg, io_size_each, 1); 366 367 break; 367 368 } 368 369 #endif
+15 -15
arch/powerpc/kvm/powerpc.c
··· 879 879 if (offset == -1) 880 880 return; 881 881 882 - if (vcpu->arch.mmio_vsx_tx_sx_enabled) { 883 - val.vval = VCPU_VSX_VR(vcpu, index); 882 + if (index >= 32) { 883 + val.vval = VCPU_VSX_VR(vcpu, index - 32); 884 884 val.vsxval[offset] = gpr; 885 - VCPU_VSX_VR(vcpu, index) = val.vval; 885 + VCPU_VSX_VR(vcpu, index - 32) = val.vval; 886 886 } else { 887 887 VCPU_VSX_FPR(vcpu, index, offset) = gpr; 888 888 } ··· 894 894 union kvmppc_one_reg val; 895 895 int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; 896 896 897 - if (vcpu->arch.mmio_vsx_tx_sx_enabled) { 898 - val.vval = VCPU_VSX_VR(vcpu, index); 897 + if (index >= 32) { 898 + val.vval = VCPU_VSX_VR(vcpu, index - 32); 899 899 val.vsxval[0] = gpr; 900 900 val.vsxval[1] = gpr; 901 - VCPU_VSX_VR(vcpu, index) = val.vval; 901 + VCPU_VSX_VR(vcpu, index - 32) = val.vval; 902 902 } else { 903 903 VCPU_VSX_FPR(vcpu, index, 0) = gpr; 904 904 VCPU_VSX_FPR(vcpu, index, 1) = gpr; ··· 911 911 union kvmppc_one_reg val; 912 912 int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; 913 913 914 - if (vcpu->arch.mmio_vsx_tx_sx_enabled) { 914 + if (index >= 32) { 915 915 val.vsx32val[0] = gpr; 916 916 val.vsx32val[1] = gpr; 917 917 val.vsx32val[2] = gpr; 918 918 val.vsx32val[3] = gpr; 919 - VCPU_VSX_VR(vcpu, index) = val.vval; 919 + VCPU_VSX_VR(vcpu, index - 32) = val.vval; 920 920 } else { 921 921 val.vsx32val[0] = gpr; 922 922 val.vsx32val[1] = gpr; ··· 936 936 if (offset == -1) 937 937 return; 938 938 939 - if (vcpu->arch.mmio_vsx_tx_sx_enabled) { 940 - val.vval = VCPU_VSX_VR(vcpu, index); 939 + if (index >= 32) { 940 + val.vval = VCPU_VSX_VR(vcpu, index - 32); 941 941 val.vsx32val[offset] = gpr32; 942 - VCPU_VSX_VR(vcpu, index) = val.vval; 942 + VCPU_VSX_VR(vcpu, index - 32) = val.vval; 943 943 } else { 944 944 dword_offset = offset / 2; 945 945 word_offset = offset % 2; ··· 1360 1360 break; 1361 1361 } 1362 1362 1363 - if (!vcpu->arch.mmio_vsx_tx_sx_enabled) { 1363 + if (rs < 32) { 1364 1364 *val = VCPU_VSX_FPR(vcpu, rs, 
vsx_offset); 1365 1365 } else { 1366 - reg.vval = VCPU_VSX_VR(vcpu, rs); 1366 + reg.vval = VCPU_VSX_VR(vcpu, rs - 32); 1367 1367 *val = reg.vsxval[vsx_offset]; 1368 1368 } 1369 1369 break; ··· 1377 1377 break; 1378 1378 } 1379 1379 1380 - if (!vcpu->arch.mmio_vsx_tx_sx_enabled) { 1380 + if (rs < 32) { 1381 1381 dword_offset = vsx_offset / 2; 1382 1382 word_offset = vsx_offset % 2; 1383 1383 reg.vsxval[0] = VCPU_VSX_FPR(vcpu, rs, dword_offset); 1384 1384 *val = reg.vsx32val[word_offset]; 1385 1385 } else { 1386 - reg.vval = VCPU_VSX_VR(vcpu, rs); 1386 + reg.vval = VCPU_VSX_VR(vcpu, rs - 32); 1387 1387 *val = reg.vsx32val[vsx_offset]; 1388 1388 } 1389 1389 break;
+4 -7
arch/s390/include/asm/kvm_host.h
··· 269 269 __u8 reserved1c0[8]; /* 0x01c0 */ 270 270 #define ECD_HOSTREGMGMT 0x20000000 271 271 #define ECD_MEF 0x08000000 272 + #define ECD_ETOKENF 0x02000000 272 273 __u32 ecd; /* 0x01c8 */ 273 274 __u8 reserved1cc[18]; /* 0x01cc */ 274 275 __u64 pp; /* 0x01de */ ··· 656 655 seqcount_t cputm_seqcount; 657 656 __u64 cputm_start; 658 657 bool gs_enabled; 658 + bool skey_enabled; 659 659 }; 660 660 661 661 struct kvm_vm_stat { ··· 795 793 struct page *pages[KVM_MAX_VCPUS]; 796 794 }; 797 795 798 - struct kvm_s390_migration_state { 799 - unsigned long bitmap_size; /* in bits (number of guest pages) */ 800 - atomic64_t dirty_pages; /* number of dirty pages */ 801 - unsigned long *pgste_bitmap; 802 - }; 803 - 804 796 struct kvm_arch{ 805 797 void *sca; 806 798 int use_esca; ··· 824 828 struct kvm_s390_vsie vsie; 825 829 u8 epdx; 826 830 u64 epoch; 827 - struct kvm_s390_migration_state *migration_state; 831 + int migration_mode; 832 + atomic64_t cmma_dirty_pages; 828 833 /* subset of available cpu features enabled by user space */ 829 834 DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); 830 835 struct kvm_s390_gisa *gisa;
+4 -1
arch/s390/include/uapi/asm/kvm.h
··· 4 4 /* 5 5 * KVM s390 specific structures and definitions 6 6 * 7 - * Copyright IBM Corp. 2008 7 + * Copyright IBM Corp. 2008, 2018 8 8 * 9 9 * Author(s): Carsten Otte <cotte@de.ibm.com> 10 10 * Christian Borntraeger <borntraeger@de.ibm.com> ··· 225 225 #define KVM_SYNC_FPRS (1UL << 8) 226 226 #define KVM_SYNC_GSCB (1UL << 9) 227 227 #define KVM_SYNC_BPBC (1UL << 10) 228 + #define KVM_SYNC_ETOKEN (1UL << 11) 228 229 /* length and alignment of the sdnx as a power of two */ 229 230 #define SDNXC 8 230 231 #define SDNXL (1UL << SDNXC) ··· 259 258 struct { 260 259 __u64 reserved1[2]; 261 260 __u64 gscb[4]; 261 + __u64 etoken; 262 + __u64 etoken_extension; 262 263 }; 263 264 }; 264 265 };
+182 -123
arch/s390/kvm/kvm-s390.c
··· 906 906 */ 907 907 static int kvm_s390_vm_start_migration(struct kvm *kvm) 908 908 { 909 - struct kvm_s390_migration_state *mgs; 910 909 struct kvm_memory_slot *ms; 911 - /* should be the only one */ 912 910 struct kvm_memslots *slots; 913 - unsigned long ram_pages; 911 + unsigned long ram_pages = 0; 914 912 int slotnr; 915 913 916 914 /* migration mode already enabled */ 917 - if (kvm->arch.migration_state) 915 + if (kvm->arch.migration_mode) 918 916 return 0; 919 - 920 917 slots = kvm_memslots(kvm); 921 918 if (!slots || !slots->used_slots) 922 919 return -EINVAL; 923 920 924 - mgs = kzalloc(sizeof(*mgs), GFP_KERNEL); 925 - if (!mgs) 926 - return -ENOMEM; 927 - kvm->arch.migration_state = mgs; 928 - 929 - if (kvm->arch.use_cmma) { 930 - /* 931 - * Get the first slot. They are reverse sorted by base_gfn, so 932 - * the first slot is also the one at the end of the address 933 - * space. We have verified above that at least one slot is 934 - * present. 935 - */ 936 - ms = slots->memslots; 937 - /* round up so we only use full longs */ 938 - ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG); 939 - /* allocate enough bytes to store all the bits */ 940 - mgs->pgste_bitmap = vmalloc(ram_pages / 8); 941 - if (!mgs->pgste_bitmap) { 942 - kfree(mgs); 943 - kvm->arch.migration_state = NULL; 944 - return -ENOMEM; 945 - } 946 - 947 - mgs->bitmap_size = ram_pages; 948 - atomic64_set(&mgs->dirty_pages, ram_pages); 949 - /* mark all the pages in active slots as dirty */ 950 - for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { 951 - ms = slots->memslots + slotnr; 952 - bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages); 953 - } 954 - 955 - kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); 921 + if (!kvm->arch.use_cmma) { 922 + kvm->arch.migration_mode = 1; 923 + return 0; 956 924 } 925 + /* mark all the pages in active slots as dirty */ 926 + for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { 927 + ms = slots->memslots + slotnr; 928 + 
/* 929 + * The second half of the bitmap is only used on x86, 930 + * and would be wasted otherwise, so we put it to good 931 + * use here to keep track of the state of the storage 932 + * attributes. 933 + */ 934 + memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms)); 935 + ram_pages += ms->npages; 936 + } 937 + atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); 938 + kvm->arch.migration_mode = 1; 939 + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); 957 940 return 0; 958 941 } 959 942 ··· 946 963 */ 947 964 static int kvm_s390_vm_stop_migration(struct kvm *kvm) 948 965 { 949 - struct kvm_s390_migration_state *mgs; 950 - 951 966 /* migration mode already disabled */ 952 - if (!kvm->arch.migration_state) 967 + if (!kvm->arch.migration_mode) 953 968 return 0; 954 - mgs = kvm->arch.migration_state; 955 - kvm->arch.migration_state = NULL; 956 - 957 - if (kvm->arch.use_cmma) { 969 + kvm->arch.migration_mode = 0; 970 + if (kvm->arch.use_cmma) 958 971 kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); 959 - /* We have to wait for the essa emulation to finish */ 960 - synchronize_srcu(&kvm->srcu); 961 - vfree(mgs->pgste_bitmap); 962 - } 963 - kfree(mgs); 964 972 return 0; 965 973 } 966 974 ··· 979 1005 static int kvm_s390_vm_get_migration(struct kvm *kvm, 980 1006 struct kvm_device_attr *attr) 981 1007 { 982 - u64 mig = (kvm->arch.migration_state != NULL); 1008 + u64 mig = kvm->arch.migration_mode; 983 1009 984 1010 if (attr->attr != KVM_S390_VM_MIGRATION_STATUS) 985 1011 return -ENXIO; ··· 1627 1653 #define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) 1628 1654 1629 1655 /* 1656 + * Similar to gfn_to_memslot, but returns the index of a memslot also when the 1657 + * address falls in a hole. In that case the index of one of the memslots 1658 + * bordering the hole is returned. 
1659 + */ 1660 + static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) 1661 + { 1662 + int start = 0, end = slots->used_slots; 1663 + int slot = atomic_read(&slots->lru_slot); 1664 + struct kvm_memory_slot *memslots = slots->memslots; 1665 + 1666 + if (gfn >= memslots[slot].base_gfn && 1667 + gfn < memslots[slot].base_gfn + memslots[slot].npages) 1668 + return slot; 1669 + 1670 + while (start < end) { 1671 + slot = start + (end - start) / 2; 1672 + 1673 + if (gfn >= memslots[slot].base_gfn) 1674 + end = slot; 1675 + else 1676 + start = slot + 1; 1677 + } 1678 + 1679 + if (gfn >= memslots[start].base_gfn && 1680 + gfn < memslots[start].base_gfn + memslots[start].npages) { 1681 + atomic_set(&slots->lru_slot, start); 1682 + } 1683 + 1684 + return start; 1685 + } 1686 + 1687 + static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, 1688 + u8 *res, unsigned long bufsize) 1689 + { 1690 + unsigned long pgstev, hva, cur_gfn = args->start_gfn; 1691 + 1692 + args->count = 0; 1693 + while (args->count < bufsize) { 1694 + hva = gfn_to_hva(kvm, cur_gfn); 1695 + /* 1696 + * We return an error if the first value was invalid, but we 1697 + * return successfully if at least one value was copied. 1698 + */ 1699 + if (kvm_is_error_hva(hva)) 1700 + return args->count ? 
0 : -EFAULT; 1701 + if (get_pgste(kvm->mm, hva, &pgstev) < 0) 1702 + pgstev = 0; 1703 + res[args->count++] = (pgstev >> 24) & 0x43; 1704 + cur_gfn++; 1705 + } 1706 + 1707 + return 0; 1708 + } 1709 + 1710 + static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, 1711 + unsigned long cur_gfn) 1712 + { 1713 + int slotidx = gfn_to_memslot_approx(slots, cur_gfn); 1714 + struct kvm_memory_slot *ms = slots->memslots + slotidx; 1715 + unsigned long ofs = cur_gfn - ms->base_gfn; 1716 + 1717 + if (ms->base_gfn + ms->npages <= cur_gfn) { 1718 + slotidx--; 1719 + /* If we are above the highest slot, wrap around */ 1720 + if (slotidx < 0) 1721 + slotidx = slots->used_slots - 1; 1722 + 1723 + ms = slots->memslots + slotidx; 1724 + ofs = 0; 1725 + } 1726 + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); 1727 + while ((slotidx > 0) && (ofs >= ms->npages)) { 1728 + slotidx--; 1729 + ms = slots->memslots + slotidx; 1730 + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0); 1731 + } 1732 + return ms->base_gfn + ofs; 1733 + } 1734 + 1735 + static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, 1736 + u8 *res, unsigned long bufsize) 1737 + { 1738 + unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev; 1739 + struct kvm_memslots *slots = kvm_memslots(kvm); 1740 + struct kvm_memory_slot *ms; 1741 + 1742 + cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); 1743 + ms = gfn_to_memslot(kvm, cur_gfn); 1744 + args->count = 0; 1745 + args->start_gfn = cur_gfn; 1746 + if (!ms) 1747 + return 0; 1748 + next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); 1749 + mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages; 1750 + 1751 + while (args->count < bufsize) { 1752 + hva = gfn_to_hva(kvm, cur_gfn); 1753 + if (kvm_is_error_hva(hva)) 1754 + return 0; 1755 + /* Decrement only if we actually flipped the bit to 0 */ 1756 + if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) 1757 + 
atomic64_dec(&kvm->arch.cmma_dirty_pages); 1758 + if (get_pgste(kvm->mm, hva, &pgstev) < 0) 1759 + pgstev = 0; 1760 + /* Save the value */ 1761 + res[args->count++] = (pgstev >> 24) & 0x43; 1762 + /* If the next bit is too far away, stop. */ 1763 + if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE) 1764 + return 0; 1765 + /* If we reached the previous "next", find the next one */ 1766 + if (cur_gfn == next_gfn) 1767 + next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); 1768 + /* Reached the end of memory or of the buffer, stop */ 1769 + if ((next_gfn >= mem_end) || 1770 + (next_gfn - args->start_gfn >= bufsize)) 1771 + return 0; 1772 + cur_gfn++; 1773 + /* Reached the end of the current memslot, take the next one. */ 1774 + if (cur_gfn - ms->base_gfn >= ms->npages) { 1775 + ms = gfn_to_memslot(kvm, cur_gfn); 1776 + if (!ms) 1777 + return 0; 1778 + } 1779 + } 1780 + return 0; 1781 + } 1782 + 1783 + /* 1630 1784 * This function searches for the next page with dirty CMMA attributes, and 1631 1785 * saves the attributes in the buffer up to either the end of the buffer or 1632 1786 * until a block of at least KVM_S390_MAX_BIT_DISTANCE clean bits is found; ··· 1765 1663 static int kvm_s390_get_cmma_bits(struct kvm *kvm, 1766 1664 struct kvm_s390_cmma_log *args) 1767 1665 { 1768 - struct kvm_s390_migration_state *s = kvm->arch.migration_state; 1769 - unsigned long bufsize, hva, pgstev, i, next, cur; 1770 - int srcu_idx, peek, r = 0, rr; 1771 - u8 *res; 1666 + unsigned long bufsize; 1667 + int srcu_idx, peek, ret; 1668 + u8 *values; 1772 1669 1773 - cur = args->start_gfn; 1774 - i = next = pgstev = 0; 1775 - 1776 - if (unlikely(!kvm->arch.use_cmma)) 1670 + if (!kvm->arch.use_cmma) 1777 1671 return -ENXIO; 1778 1672 /* Invalid/unsupported flags were specified */ 1779 1673 if (args->flags & ~KVM_S390_CMMA_PEEK) 1780 1674 return -EINVAL; 1781 1675 /* Migration mode query, and we are not doing a migration */ 1782 1676 peek = !!(args->flags & KVM_S390_CMMA_PEEK); 1783 - 
if (!peek && !s) 1677 + if (!peek && !kvm->arch.migration_mode) 1784 1678 return -EINVAL; 1785 1679 /* CMMA is disabled or was not used, or the buffer has length zero */ 1786 1680 bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); ··· 1784 1686 memset(args, 0, sizeof(*args)); 1785 1687 return 0; 1786 1688 } 1787 - 1788 - if (!peek) { 1789 - /* We are not peeking, and there are no dirty pages */ 1790 - if (!atomic64_read(&s->dirty_pages)) { 1791 - memset(args, 0, sizeof(*args)); 1792 - return 0; 1793 - } 1794 - cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 1795 - args->start_gfn); 1796 - if (cur >= s->bitmap_size) /* nothing found, loop back */ 1797 - cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0); 1798 - if (cur >= s->bitmap_size) { /* again! (very unlikely) */ 1799 - memset(args, 0, sizeof(*args)); 1800 - return 0; 1801 - } 1802 - next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1); 1689 + /* We are not peeking, and there are no dirty pages */ 1690 + if (!peek && !atomic64_read(&kvm->arch.cmma_dirty_pages)) { 1691 + memset(args, 0, sizeof(*args)); 1692 + return 0; 1803 1693 } 1804 1694 1805 - res = vmalloc(bufsize); 1806 - if (!res) 1695 + values = vmalloc(bufsize); 1696 + if (!values) 1807 1697 return -ENOMEM; 1808 - 1809 - args->start_gfn = cur; 1810 1698 1811 1699 down_read(&kvm->mm->mmap_sem); 1812 1700 srcu_idx = srcu_read_lock(&kvm->srcu); 1813 - while (i < bufsize) { 1814 - hva = gfn_to_hva(kvm, cur); 1815 - if (kvm_is_error_hva(hva)) { 1816 - r = -EFAULT; 1817 - break; 1818 - } 1819 - /* decrement only if we actually flipped the bit to 0 */ 1820 - if (!peek && test_and_clear_bit(cur, s->pgste_bitmap)) 1821 - atomic64_dec(&s->dirty_pages); 1822 - r = get_pgste(kvm->mm, hva, &pgstev); 1823 - if (r < 0) 1824 - pgstev = 0; 1825 - /* save the value */ 1826 - res[i++] = (pgstev >> 24) & 0x43; 1827 - /* 1828 - * if the next bit is too far away, stop. 
1829 - * if we reached the previous "next", find the next one 1830 - */ 1831 - if (!peek) { 1832 - if (next > cur + KVM_S390_MAX_BIT_DISTANCE) 1833 - break; 1834 - if (cur == next) 1835 - next = find_next_bit(s->pgste_bitmap, 1836 - s->bitmap_size, cur + 1); 1837 - /* reached the end of the bitmap or of the buffer, stop */ 1838 - if ((next >= s->bitmap_size) || 1839 - (next >= args->start_gfn + bufsize)) 1840 - break; 1841 - } 1842 - cur++; 1843 - } 1701 + if (peek) 1702 + ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); 1703 + else 1704 + ret = kvm_s390_get_cmma(kvm, args, values, bufsize); 1844 1705 srcu_read_unlock(&kvm->srcu, srcu_idx); 1845 1706 up_read(&kvm->mm->mmap_sem); 1846 - args->count = i; 1847 - args->remaining = s ? atomic64_read(&s->dirty_pages) : 0; 1848 1707 1849 - rr = copy_to_user((void __user *)args->values, res, args->count); 1850 - if (rr) 1851 - r = -EFAULT; 1708 + if (kvm->arch.migration_mode) 1709 + args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); 1710 + else 1711 + args->remaining = 0; 1852 1712 1853 - vfree(res); 1854 - return r; 1713 + if (copy_to_user((void __user *)args->values, values, args->count)) 1714 + ret = -EFAULT; 1715 + 1716 + vfree(values); 1717 + return ret; 1855 1718 } 1856 1719 1857 1720 /* ··· 2251 2192 kvm_s390_destroy_adapters(kvm); 2252 2193 kvm_s390_clear_float_irqs(kvm); 2253 2194 kvm_s390_vsie_destroy(kvm); 2254 - if (kvm->arch.migration_state) { 2255 - vfree(kvm->arch.migration_state->pgste_bitmap); 2256 - kfree(kvm->arch.migration_state); 2257 - } 2258 2195 KVM_EVENT(3, "vm 0x%pK destroyed", kvm); 2259 2196 } 2260 2197 ··· 2408 2353 vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; 2409 2354 if (test_kvm_facility(vcpu->kvm, 133)) 2410 2355 vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; 2356 + if (test_kvm_facility(vcpu->kvm, 156)) 2357 + vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; 2411 2358 /* fprs can be synchronized via vrs, even if the guest has no vx. 
With 2412 2359 * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. 2413 2360 */ ··· 2659 2602 } 2660 2603 if (test_kvm_facility(vcpu->kvm, 139)) 2661 2604 vcpu->arch.sie_block->ecd |= ECD_MEF; 2662 - 2605 + if (test_kvm_facility(vcpu->kvm, 156)) 2606 + vcpu->arch.sie_block->ecd |= ECD_ETOKENF; 2663 2607 if (vcpu->arch.sie_block->gd) { 2664 2608 vcpu->arch.sie_block->eca |= ECA_AIV; 2665 2609 VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", ··· 3578 3520 } 3579 3521 preempt_enable(); 3580 3522 } 3523 + /* SIE will load etoken directly from SDNX and therefore kvm_run */ 3581 3524 3582 3525 kvm_run->kvm_dirty_regs = 0; 3583 3526 } ··· 3618 3559 __ctl_clear_bit(2, 4); 3619 3560 vcpu->arch.host_gscb = NULL; 3620 3561 } 3621 - 3562 + /* SIE will save etoken directly into SDNX and therefore kvm_run */ 3622 3563 } 3623 3564 3624 3565 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+24 -16
arch/s390/kvm/priv.c
··· 205 205 int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) 206 206 { 207 207 int rc; 208 - struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block; 209 208 210 209 trace_kvm_s390_skey_related_inst(vcpu); 211 210 /* Already enabled? */ 212 - if (vcpu->kvm->arch.use_skf && 213 - !(sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)) && 214 - !kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS)) 211 + if (vcpu->arch.skey_enabled) 215 212 return 0; 216 213 217 214 rc = s390_enable_skey(); ··· 219 222 if (kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS)) 220 223 kvm_s390_clear_cpuflags(vcpu, CPUSTAT_KSS); 221 224 if (!vcpu->kvm->arch.use_skf) 222 - sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; 225 + vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; 223 226 else 224 - sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); 227 + vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); 228 + vcpu->arch.skey_enabled = true; 225 229 return 0; 226 230 } 227 231 ··· 985 987 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 986 988 987 989 if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { 988 - if (clear_user((void __user *)vmaddr, PAGE_SIZE)) 990 + if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE)) 989 991 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 990 992 } 991 993 ··· 1022 1024 return 0; 1023 1025 } 1024 1026 1025 - static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) 1027 + /* 1028 + * Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu) 1029 + */ 1030 + static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) 1026 1031 { 1027 - struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state; 1028 1032 int r1, r2, nappended, entries; 1029 1033 unsigned long gfn, hva, res, pgstev, ptev; 1030 1034 unsigned long *cbrlo; ··· 1076 1076 cbrlo[entries] = gfn << PAGE_SHIFT; 1077 1077 } 1078 1078 1079 - if (orc && gfn < ms->bitmap_size) { 1080 - /* increment only if we are really 
flipping the bit to 1 */ 1081 - if (!test_and_set_bit(gfn, ms->pgste_bitmap)) 1082 - atomic64_inc(&ms->dirty_pages); 1079 + if (orc) { 1080 + struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn); 1081 + 1082 + /* Increment only if we are really flipping the bit */ 1083 + if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) 1084 + atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); 1083 1085 } 1084 1086 1085 1087 return nappended; ··· 1110 1108 : ESSA_SET_STABLE_IF_RESIDENT)) 1111 1109 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 1112 1110 1113 - if (likely(!vcpu->kvm->arch.migration_state)) { 1111 + if (!vcpu->kvm->arch.migration_mode) { 1114 1112 /* 1115 1113 * CMMA is enabled in the KVM settings, but is disabled in 1116 1114 * the SIE block and in the mm_context, and we are not doing ··· 1138 1136 /* Retry the ESSA instruction */ 1139 1137 kvm_s390_retry_instr(vcpu); 1140 1138 } else { 1141 - /* Account for the possible extra cbrl entry */ 1142 - i = do_essa(vcpu, orc); 1139 + int srcu_idx; 1140 + 1141 + down_read(&vcpu->kvm->mm->mmap_sem); 1142 + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 1143 + i = __do_essa(vcpu, orc); 1144 + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); 1145 + up_read(&vcpu->kvm->mm->mmap_sem); 1143 1146 if (i < 0) 1144 1147 return i; 1148 + /* Account for the possible extra cbrl entry */ 1145 1149 entries += i; 1146 1150 } 1147 1151 vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */
+9 -2
arch/s390/kvm/vsie.c
··· 2 2 /* 3 3 * kvm nested virtualization support for s390x 4 4 * 5 - * Copyright IBM Corp. 2016 5 + * Copyright IBM Corp. 2016, 2018 6 6 * 7 7 * Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com> 8 8 */ ··· 378 378 if (test_kvm_facility(vcpu->kvm, 139)) 379 379 scb_s->ecd |= scb_o->ecd & ECD_MEF; 380 380 381 + /* etoken */ 382 + if (test_kvm_facility(vcpu->kvm, 156)) 383 + scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; 384 + 381 385 prepare_ibc(vcpu, vsie_page); 382 386 rc = shadow_crycb(vcpu, vsie_page); 383 387 out: ··· 631 627 vsie_page->riccbd_gpa = gpa; 632 628 scb_s->riccbd = hpa; 633 629 } 634 - if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { 630 + if (((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) || 631 + (scb_s->ecd & ECD_ETOKENF)) { 635 632 unsigned long sdnxc; 636 633 637 634 gpa = READ_ONCE(scb_o->sdnxo) & ~0xfUL; ··· 823 818 * - < 0 if an error occurred 824 819 */ 825 820 static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) 821 + __releases(vcpu->kvm->srcu) 822 + __acquires(vcpu->kvm->srcu) 826 823 { 827 824 struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; 828 825 struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+2 -1
arch/s390/tools/gen_facilities.c
··· 4 4 * numbering scheme from the Princples of Operations: most significant bit 5 5 * has bit number 0. 6 6 * 7 - * Copyright IBM Corp. 2015 7 + * Copyright IBM Corp. 2015, 2018 8 8 * 9 9 */ 10 10 ··· 106 106 107 107 .name = "FACILITIES_KVM_CPUMODEL", 108 108 .bits = (int[]){ 109 + 156, /* etoken facility */ 109 110 -1 /* END */ 110 111 } 111 112 },
+1 -1
arch/x86/hyperv/Makefile
··· 1 - obj-y := hv_init.o mmu.o 1 + obj-y := hv_init.o mmu.o nested.o 2 2 obj-$(CONFIG_X86_64) += hv_apic.o
+56
arch/x86/hyperv/nested.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + /* 4 + * Hyper-V nested virtualization code. 5 + * 6 + * Copyright (C) 2018, Microsoft, Inc. 7 + * 8 + * Author : Lan Tianyu <Tianyu.Lan@microsoft.com> 9 + */ 10 + 11 + 12 + #include <linux/types.h> 13 + #include <asm/hyperv-tlfs.h> 14 + #include <asm/mshyperv.h> 15 + #include <asm/tlbflush.h> 16 + 17 + #include <asm/trace/hyperv.h> 18 + 19 + int hyperv_flush_guest_mapping(u64 as) 20 + { 21 + struct hv_guest_mapping_flush **flush_pcpu; 22 + struct hv_guest_mapping_flush *flush; 23 + u64 status; 24 + unsigned long flags; 25 + int ret = -ENOTSUPP; 26 + 27 + if (!hv_hypercall_pg) 28 + goto fault; 29 + 30 + local_irq_save(flags); 31 + 32 + flush_pcpu = (struct hv_guest_mapping_flush **) 33 + this_cpu_ptr(hyperv_pcpu_input_arg); 34 + 35 + flush = *flush_pcpu; 36 + 37 + if (unlikely(!flush)) { 38 + local_irq_restore(flags); 39 + goto fault; 40 + } 41 + 42 + flush->address_space = as; 43 + flush->flags = 0; 44 + 45 + status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE, 46 + flush, NULL); 47 + local_irq_restore(flags); 48 + 49 + if (!(status & HV_HYPERCALL_RESULT_MASK)) 50 + ret = 0; 51 + 52 + fault: 53 + trace_hyperv_nested_flush_guest_mapping(as, ret); 54 + return ret; 55 + } 56 + EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping);
+8
arch/x86/include/asm/hyperv-tlfs.h
··· 310 310 #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 311 311 312 312 /* Nested features (CPUID 0x4000000A) EAX */ 313 + #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) 313 314 #define HV_X64_NESTED_MSR_BITMAP BIT(19) 314 315 315 316 struct hv_reenlightenment_control { ··· 352 351 #define HVCALL_SEND_IPI_EX 0x0015 353 352 #define HVCALL_POST_MESSAGE 0x005c 354 353 #define HVCALL_SIGNAL_EVENT 0x005d 354 + #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af 355 355 356 356 #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 357 357 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 ··· 742 740 u32 vector; 743 741 u32 reserved; 744 742 struct hv_vpset vp_set; 743 + }; 744 + 745 + /* HvFlushGuestPhysicalAddressSpace hypercalls */ 746 + struct hv_guest_mapping_flush { 747 + u64 address_space; 748 + u64 flags; 745 749 }; 746 750 747 751 /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */
+52 -4
arch/x86/include/asm/kvm_host.h
··· 55 55 #define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) 56 56 #define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) 57 57 #define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) 58 + #define KVM_REQ_LOAD_CR3 KVM_ARCH_REQ(5) 58 59 #define KVM_REQ_EVENT KVM_ARCH_REQ(6) 59 60 #define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) 60 61 #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) ··· 77 76 #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) 78 77 #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) 79 78 #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) 79 + #define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) 80 80 81 81 #define CR0_RESERVED_BITS \ 82 82 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 83 83 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 84 84 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 85 85 86 - #define CR3_PCID_INVD BIT_64(63) 87 86 #define CR4_RESERVED_BITS \ 88 87 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 89 88 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ ··· 327 326 u64 bad_mt_xwr; 328 327 }; 329 328 329 + struct kvm_mmu_root_info { 330 + gpa_t cr3; 331 + hpa_t hpa; 332 + }; 333 + 334 + #define KVM_MMU_ROOT_INFO_INVALID \ 335 + ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) 336 + 337 + #define KVM_MMU_NUM_PREV_ROOTS 3 338 + 330 339 /* 331 340 * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, 332 341 * and 2-level 32-bit). 
The kvm_mmu structure abstracts the details of the ··· 356 345 struct x86_exception *exception); 357 346 int (*sync_page)(struct kvm_vcpu *vcpu, 358 347 struct kvm_mmu_page *sp); 359 - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 348 + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); 360 349 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 361 350 u64 *spte, const void *pte); 362 351 hpa_t root_hpa; ··· 365 354 u8 shadow_root_level; 366 355 u8 ept_ad; 367 356 bool direct_map; 357 + struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; 368 358 369 359 /* 370 360 * Bitmap; bit set = permission fault ··· 990 978 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 991 979 992 980 void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); 981 + int (*tlb_remote_flush)(struct kvm *kvm); 982 + 983 + /* 984 + * Flush any TLB entries associated with the given GVA. 985 + * Does not need to flush GPA->HPA mappings. 986 + * Can potentially get non-canonical addresses through INVLPGs, which 987 + * the implementation may choose to ignore if appropriate. 
988 + */ 989 + void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); 993 990 994 991 void (*run)(struct kvm_vcpu *vcpu); 995 992 int (*handle_exit)(struct kvm_vcpu *vcpu); ··· 1111 1090 1112 1091 void (*setup_mce)(struct kvm_vcpu *vcpu); 1113 1092 1093 + int (*get_nested_state)(struct kvm_vcpu *vcpu, 1094 + struct kvm_nested_state __user *user_kvm_nested_state, 1095 + unsigned user_data_size); 1096 + int (*set_nested_state)(struct kvm_vcpu *vcpu, 1097 + struct kvm_nested_state __user *user_kvm_nested_state, 1098 + struct kvm_nested_state *kvm_state); 1099 + void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); 1100 + 1114 1101 int (*smi_allowed)(struct kvm_vcpu *vcpu); 1115 1102 int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); 1116 1103 int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); ··· 1149 1120 static inline void kvm_arch_free_vm(struct kvm *kvm) 1150 1121 { 1151 1122 return kvm_x86_ops->vm_free(kvm); 1123 + } 1124 + 1125 + #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB 1126 + static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) 1127 + { 1128 + if (kvm_x86_ops->tlb_remote_flush && 1129 + !kvm_x86_ops->tlb_remote_flush(kvm)) 1130 + return 0; 1131 + else 1132 + return -ENOTSUPP; 1152 1133 } 1153 1134 1154 1135 int kvm_mmu_module_init(void); ··· 1312 1273 return !!(*irq_state); 1313 1274 } 1314 1275 1276 + #define KVM_MMU_ROOT_CURRENT BIT(0) 1277 + #define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) 1278 + #define KVM_MMU_ROOTS_ALL (~0UL) 1279 + 1315 1280 int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); 1316 1281 void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); 1317 1282 ··· 1327 1284 int kvm_mmu_load(struct kvm_vcpu *vcpu); 1328 1285 void kvm_mmu_unload(struct kvm_vcpu *vcpu); 1329 1286 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 1330 - void kvm_mmu_free_roots(struct kvm_vcpu *vcpu); 1287 + void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free); 1331 1288 gpa_t translate_nested_gpa(struct 
kvm_vcpu *vcpu, gpa_t gpa, u32 access, 1332 1289 struct x86_exception *exception); 1333 1290 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, ··· 1346 1303 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, 1347 1304 void *insn, int insn_len); 1348 1305 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); 1349 - void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); 1306 + void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); 1307 + void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); 1350 1308 1351 1309 void kvm_enable_tdp(void); 1352 1310 void kvm_disable_tdp(void); ··· 1461 1417 int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 1462 1418 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); 1463 1419 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); 1420 + 1421 + int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, 1422 + unsigned long ipi_bitmap_high, int min, 1423 + unsigned long icr, int op_64_bit); 1464 1424 1465 1425 u64 kvm_get_arch_capabilities(void); 1466 1426 void kvm_define_shared_msr(unsigned index, u32 msr);
+2
arch/x86/include/asm/mshyperv.h
··· 347 347 void set_hv_tscchange_cb(void (*cb)(void)); 348 348 void clear_hv_tscchange_cb(void); 349 349 void hyperv_stop_tsc_emulation(void); 350 + int hyperv_flush_guest_mapping(u64 as); 350 351 351 352 #ifdef CONFIG_X86_64 352 353 void hv_apic_init(void); ··· 367 366 { 368 367 return NULL; 369 368 } 369 + static inline int hyperv_flush_guest_mapping(u64 as) { return -1; } 370 370 #endif /* CONFIG_HYPERV */ 371 371 372 372 #ifdef CONFIG_HYPERV_TSCPAGE
+14
arch/x86/include/asm/trace/hyperv.h
··· 28 28 __entry->addr, __entry->end) 29 29 ); 30 30 31 + TRACE_EVENT(hyperv_nested_flush_guest_mapping, 32 + TP_PROTO(u64 as, int ret), 33 + TP_ARGS(as, ret), 34 + 35 + TP_STRUCT__entry( 36 + __field(u64, as) 37 + __field(int, ret) 38 + ), 39 + TP_fast_assign(__entry->as = as; 40 + __entry->ret = ret; 41 + ), 42 + TP_printk("address space %llx ret %d", __entry->as, __entry->ret) 43 + ); 44 + 31 45 TRACE_EVENT(hyperv_send_ipi_mask, 32 46 TP_PROTO(const struct cpumask *cpus, 33 47 int vector),
+37
arch/x86/include/uapi/asm/kvm.h
··· 378 378 #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) 379 379 #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) 380 380 381 + #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 382 + #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 383 + 384 + #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 385 + #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 386 + 387 + struct kvm_vmx_nested_state { 388 + __u64 vmxon_pa; 389 + __u64 vmcs_pa; 390 + 391 + struct { 392 + __u16 flags; 393 + } smm; 394 + }; 395 + 396 + /* for KVM_CAP_NESTED_STATE */ 397 + struct kvm_nested_state { 398 + /* KVM_STATE_* flags */ 399 + __u16 flags; 400 + 401 + /* 0 for VMX, 1 for SVM. */ 402 + __u16 format; 403 + 404 + /* 128 for SVM, 128 + VMCS size for VMX. */ 405 + __u32 size; 406 + 407 + union { 408 + /* VMXON, VMCS */ 409 + struct kvm_vmx_nested_state vmx; 410 + 411 + /* Pad the header to 128 bytes. */ 412 + __u8 pad[120]; 413 + }; 414 + 415 + __u8 data[0]; 416 + }; 417 + 381 418 #endif /* _ASM_X86_KVM_H */
+1
arch/x86/include/uapi/asm/kvm_para.h
··· 28 28 #define KVM_FEATURE_PV_UNHALT 7 29 29 #define KVM_FEATURE_PV_TLB_FLUSH 9 30 30 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 31 + #define KVM_FEATURE_PV_SEND_IPI 11 31 32 32 33 #define KVM_HINTS_REALTIME 0 33 34
+111 -1
arch/x86/kernel/kvm.c
··· 444 444 } 445 445 446 446 #ifdef CONFIG_SMP 447 + #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) 448 + 449 + static void __send_ipi_mask(const struct cpumask *mask, int vector) 450 + { 451 + unsigned long flags; 452 + int cpu, apic_id, icr; 453 + int min = 0, max = 0; 454 + #ifdef CONFIG_X86_64 455 + __uint128_t ipi_bitmap = 0; 456 + #else 457 + u64 ipi_bitmap = 0; 458 + #endif 459 + 460 + if (cpumask_empty(mask)) 461 + return; 462 + 463 + local_irq_save(flags); 464 + 465 + switch (vector) { 466 + default: 467 + icr = APIC_DM_FIXED | vector; 468 + break; 469 + case NMI_VECTOR: 470 + icr = APIC_DM_NMI; 471 + break; 472 + } 473 + 474 + for_each_cpu(cpu, mask) { 475 + apic_id = per_cpu(x86_cpu_to_apicid, cpu); 476 + if (!ipi_bitmap) { 477 + min = max = apic_id; 478 + } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { 479 + ipi_bitmap <<= min - apic_id; 480 + min = apic_id; 481 + } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { 482 + max = apic_id < max ? max : apic_id; 483 + } else { 484 + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, 485 + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); 486 + min = max = apic_id; 487 + ipi_bitmap = 0; 488 + } 489 + __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); 490 + } 491 + 492 + if (ipi_bitmap) { 493 + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, 494 + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); 495 + } 496 + 497 + local_irq_restore(flags); 498 + } 499 + 500 + static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) 501 + { 502 + __send_ipi_mask(mask, vector); 503 + } 504 + 505 + static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) 506 + { 507 + unsigned int this_cpu = smp_processor_id(); 508 + struct cpumask new_mask; 509 + const struct cpumask *local_mask; 510 + 511 + cpumask_copy(&new_mask, mask); 512 + cpumask_clear_cpu(this_cpu, &new_mask); 513 + local_mask = &new_mask; 514 + __send_ipi_mask(local_mask, 
vector); 515 + } 516 + 517 + static void kvm_send_ipi_allbutself(int vector) 518 + { 519 + kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); 520 + } 521 + 522 + static void kvm_send_ipi_all(int vector) 523 + { 524 + __send_ipi_mask(cpu_online_mask, vector); 525 + } 526 + 527 + /* 528 + * Set the IPI entry points 529 + */ 530 + static void kvm_setup_pv_ipi(void) 531 + { 532 + apic->send_IPI_mask = kvm_send_ipi_mask; 533 + apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; 534 + apic->send_IPI_allbutself = kvm_send_ipi_allbutself; 535 + apic->send_IPI_all = kvm_send_ipi_all; 536 + pr_info("KVM setup pv IPIs\n"); 537 + } 538 + 447 539 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) 448 540 { 449 541 native_smp_prepare_cpus(max_cpus); ··· 703 611 return kvm_cpuid_base(); 704 612 } 705 613 614 + static void __init kvm_apic_init(void) 615 + { 616 + #if defined(CONFIG_SMP) 617 + if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI)) 618 + kvm_setup_pv_ipi(); 619 + #endif 620 + } 621 + 622 + static void __init kvm_init_platform(void) 623 + { 624 + kvmclock_init(); 625 + x86_platform.apic_post_init = kvm_apic_init; 626 + } 627 + 706 628 const __initconst struct hypervisor_x86 x86_hyper_kvm = { 707 629 .name = "KVM", 708 630 .detect = kvm_detect, 709 631 .type = X86_HYPER_KVM, 710 - .init.init_platform = kvmclock_init, 711 632 .init.guest_late_init = kvm_guest_init, 712 633 .init.x2apic_available = kvm_para_available, 634 + .init.init_platform = kvm_init_platform, 713 635 }; 714 636 715 637 static __init int activate_jump_labels(void) ··· 840 734 return; 841 735 842 736 if (kvm_para_has_hint(KVM_HINTS_REALTIME)) 737 + return; 738 + 739 + /* Don't use the pvqspinlock code if there is only 1 vCPU. */ 740 + if (num_possible_cpus() == 1) 843 741 return; 844 742 845 743 __pv_init_lock_hash();
+2 -1
arch/x86/kvm/cpuid.c
··· 621 621 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | 622 622 (1 << KVM_FEATURE_PV_UNHALT) | 623 623 (1 << KVM_FEATURE_PV_TLB_FLUSH) | 624 - (1 << KVM_FEATURE_ASYNC_PF_VMEXIT); 624 + (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | 625 + (1 << KVM_FEATURE_PV_SEND_IPI); 625 626 626 627 if (sched_info_on()) 627 628 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+1 -1
arch/x86/kvm/emulate.c
··· 4191 4191 maxphyaddr = 36; 4192 4192 rsvd = rsvd_bits(maxphyaddr, 63); 4193 4193 if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE) 4194 - rsvd &= ~CR3_PCID_INVD; 4194 + rsvd &= ~X86_CR3_PCID_NOFLUSH; 4195 4195 } 4196 4196 4197 4197 if (new_val & rsvd)
+20 -7
arch/x86/kvm/hyperv.c
··· 235 235 struct kvm_vcpu *vcpu = synic_to_vcpu(synic); 236 236 int ret; 237 237 238 - if (!synic->active) 238 + if (!synic->active && !host) 239 239 return 1; 240 240 241 241 trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); ··· 295 295 return ret; 296 296 } 297 297 298 - static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) 298 + static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, 299 + bool host) 299 300 { 300 301 int ret; 301 302 302 - if (!synic->active) 303 + if (!synic->active && !host) 303 304 return 1; 304 305 305 306 ret = 0; ··· 1015 1014 case HV_X64_MSR_TSC_EMULATION_STATUS: 1016 1015 hv->hv_tsc_emulation_status = data; 1017 1016 break; 1017 + case HV_X64_MSR_TIME_REF_COUNT: 1018 + /* read-only, but still ignore it if host-initiated */ 1019 + if (!host) 1020 + return 1; 1021 + break; 1018 1022 default: 1019 1023 vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", 1020 1024 msr, data); ··· 1107 1101 return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), 1108 1102 data, host); 1109 1103 } 1104 + case HV_X64_MSR_TSC_FREQUENCY: 1105 + case HV_X64_MSR_APIC_FREQUENCY: 1106 + /* read-only, but still ignore it if host-initiated */ 1107 + if (!host) 1108 + return 1; 1109 + break; 1110 1110 default: 1111 1111 vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", 1112 1112 msr, data); ··· 1168 1156 return 0; 1169 1157 } 1170 1158 1171 - static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1159 + static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, 1160 + bool host) 1172 1161 { 1173 1162 u64 data = 0; 1174 1163 struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; ··· 1196 1183 case HV_X64_MSR_SIMP: 1197 1184 case HV_X64_MSR_EOM: 1198 1185 case HV_X64_MSR_SINT0 ... 
HV_X64_MSR_SINT15: 1199 - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata); 1186 + return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host); 1200 1187 case HV_X64_MSR_STIMER0_CONFIG: 1201 1188 case HV_X64_MSR_STIMER1_CONFIG: 1202 1189 case HV_X64_MSR_STIMER2_CONFIG: ··· 1242 1229 return kvm_hv_set_msr(vcpu, msr, data, host); 1243 1230 } 1244 1231 1245 - int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1232 + int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) 1246 1233 { 1247 1234 if (kvm_hv_msr_partition_wide(msr)) { 1248 1235 int r; ··· 1252 1239 mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); 1253 1240 return r; 1254 1241 } else 1255 - return kvm_hv_get_msr(vcpu, msr, pdata); 1242 + return kvm_hv_get_msr(vcpu, msr, pdata, host); 1256 1243 } 1257 1244 1258 1245 static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
+1 -1
arch/x86/kvm/hyperv.h
··· 48 48 } 49 49 50 50 int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); 51 - int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 51 + int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); 52 52 53 53 bool kvm_hv_hypercall_enabled(struct kvm *kvm); 54 54 int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
+40
arch/x86/kvm/lapic.c
··· 547 547 irq->level, irq->trig_mode, dest_map); 548 548 } 549 549 550 + int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, 551 + unsigned long ipi_bitmap_high, int min, 552 + unsigned long icr, int op_64_bit) 553 + { 554 + int i; 555 + struct kvm_apic_map *map; 556 + struct kvm_vcpu *vcpu; 557 + struct kvm_lapic_irq irq = {0}; 558 + int cluster_size = op_64_bit ? 64 : 32; 559 + int count = 0; 560 + 561 + irq.vector = icr & APIC_VECTOR_MASK; 562 + irq.delivery_mode = icr & APIC_MODE_MASK; 563 + irq.level = (icr & APIC_INT_ASSERT) != 0; 564 + irq.trig_mode = icr & APIC_INT_LEVELTRIG; 565 + 566 + if (icr & APIC_DEST_MASK) 567 + return -KVM_EINVAL; 568 + if (icr & APIC_SHORT_MASK) 569 + return -KVM_EINVAL; 570 + 571 + rcu_read_lock(); 572 + map = rcu_dereference(kvm->arch.apic_map); 573 + 574 + /* Bits above cluster_size are masked in the caller. */ 575 + for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { 576 + vcpu = map->phys_map[min + i]->vcpu; 577 + count += kvm_apic_set_irq(vcpu, &irq, NULL); 578 + } 579 + 580 + min += cluster_size; 581 + for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { 582 + vcpu = map->phys_map[min + i]->vcpu; 583 + count += kvm_apic_set_irq(vcpu, &irq, NULL); 584 + } 585 + 586 + rcu_read_unlock(); 587 + return count; 588 + } 589 + 550 590 static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) 551 591 { 552 592
+442 -89
arch/x86/kvm/mmu.c
··· 178 178 unsigned index; 179 179 }; 180 180 181 - #define for_each_shadow_entry(_vcpu, _addr, _walker) \ 181 + static const union kvm_mmu_page_role mmu_base_role_mask = { 182 + .cr0_wp = 1, 183 + .cr4_pae = 1, 184 + .nxe = 1, 185 + .smep_andnot_wp = 1, 186 + .smap_andnot_wp = 1, 187 + .smm = 1, 188 + .guest_mode = 1, 189 + .ad_disabled = 1, 190 + }; 191 + 192 + #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \ 193 + for (shadow_walk_init_using_root(&(_walker), (_vcpu), \ 194 + (_root), (_addr)); \ 195 + shadow_walk_okay(&(_walker)); \ 196 + shadow_walk_next(&(_walker))) 197 + 198 + #define for_each_shadow_entry(_vcpu, _addr, _walker) \ 182 199 for (shadow_walk_init(&(_walker), _vcpu, _addr); \ 183 200 shadow_walk_okay(&(_walker)); \ 184 201 shadow_walk_next(&(_walker))) ··· 238 221 PT64_EPT_EXECUTABLE_MASK; 239 222 static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; 240 223 224 + /* 225 + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order 226 + * to guard against L1TF attacks. 227 + */ 228 + static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; 229 + 230 + /* 231 + * The number of high-order 1 bits to use in the mask above. 
232 + */ 233 + static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; 234 + 241 235 static void mmu_spte_set(u64 *sptep, u64 spte); 236 + static union kvm_mmu_page_role 237 + kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); 242 238 243 239 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) 244 240 { ··· 338 308 { 339 309 unsigned int gen = kvm_current_mmio_generation(vcpu); 340 310 u64 mask = generation_mmio_spte_mask(gen); 311 + u64 gpa = gfn << PAGE_SHIFT; 341 312 342 313 access &= ACC_WRITE_MASK | ACC_USER_MASK; 343 - mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; 314 + mask |= shadow_mmio_value | access; 315 + mask |= gpa | shadow_nonpresent_or_rsvd_mask; 316 + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) 317 + << shadow_nonpresent_or_rsvd_mask_len; 344 318 345 319 trace_mark_mmio_spte(sptep, gfn, access, gen); 346 320 mmu_spte_set(sptep, mask); ··· 357 323 358 324 static gfn_t get_mmio_spte_gfn(u64 spte) 359 325 { 360 - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; 361 - return (spte & ~mask) >> PAGE_SHIFT; 326 + u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask | 327 + shadow_nonpresent_or_rsvd_mask; 328 + u64 gpa = spte & ~mask; 329 + 330 + gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) 331 + & shadow_nonpresent_or_rsvd_mask; 332 + 333 + return gpa >> PAGE_SHIFT; 362 334 } 363 335 364 336 static unsigned get_mmio_spte_access(u64 spte) ··· 421 381 } 422 382 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 423 383 424 - static void kvm_mmu_clear_all_pte_masks(void) 384 + static void kvm_mmu_reset_all_pte_masks(void) 425 385 { 426 386 shadow_user_mask = 0; 427 387 shadow_accessed_mask = 0; ··· 431 391 shadow_mmio_mask = 0; 432 392 shadow_present_mask = 0; 433 393 shadow_acc_track_mask = 0; 394 + 395 + /* 396 + * If the CPU has 46 or less physical address bits, then set an 397 + * appropriate mask to guard against L1TF attacks. 
Otherwise, it is 398 + * assumed that the CPU is not vulnerable to L1TF. 399 + */ 400 + if (boot_cpu_data.x86_phys_bits < 401 + 52 - shadow_nonpresent_or_rsvd_mask_len) 402 + shadow_nonpresent_or_rsvd_mask = 403 + rsvd_bits(boot_cpu_data.x86_phys_bits - 404 + shadow_nonpresent_or_rsvd_mask_len, 405 + boot_cpu_data.x86_phys_bits - 1); 434 406 } 435 407 436 408 static int is_cpuid_PSE36(void) ··· 2038 1986 return 0; 2039 1987 } 2040 1988 2041 - static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 1989 + static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root) 2042 1990 { 2043 1991 } 2044 1992 ··· 2169 2117 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 2170 2118 struct list_head *invalid_list) 2171 2119 { 2172 - if (sp->role.cr4_pae != !!is_pae(vcpu)) { 2173 - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 2174 - return false; 2175 - } 2176 - 2177 - if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { 2120 + if (sp->role.cr4_pae != !!is_pae(vcpu) 2121 + || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { 2178 2122 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 2179 2123 return false; 2180 2124 } ··· 2440 2392 return sp; 2441 2393 } 2442 2394 2443 - static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, 2444 - struct kvm_vcpu *vcpu, u64 addr) 2395 + static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, 2396 + struct kvm_vcpu *vcpu, hpa_t root, 2397 + u64 addr) 2445 2398 { 2446 2399 iterator->addr = addr; 2447 - iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 2400 + iterator->shadow_addr = root; 2448 2401 iterator->level = vcpu->arch.mmu.shadow_root_level; 2449 2402 2450 2403 if (iterator->level == PT64_ROOT_4LEVEL && ··· 2454 2405 --iterator->level; 2455 2406 2456 2407 if (iterator->level == PT32E_ROOT_LEVEL) { 2408 + /* 2409 + * prev_root is currently only used for 64-bit hosts. So only 2410 + * the active root_hpa is valid here. 
2411 + */ 2412 + BUG_ON(root != vcpu->arch.mmu.root_hpa); 2413 + 2457 2414 iterator->shadow_addr 2458 2415 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 2459 2416 iterator->shadow_addr &= PT64_BASE_ADDR_MASK; ··· 2467 2412 if (!iterator->shadow_addr) 2468 2413 iterator->level = 0; 2469 2414 } 2415 + } 2416 + 2417 + static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, 2418 + struct kvm_vcpu *vcpu, u64 addr) 2419 + { 2420 + shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa, 2421 + addr); 2470 2422 } 2471 2423 2472 2424 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) ··· 2764 2702 kvm_unsync_page(vcpu, sp); 2765 2703 } 2766 2704 2705 + /* 2706 + * We need to ensure that the marking of unsync pages is visible 2707 + * before the SPTE is updated to allow writes because 2708 + * kvm_mmu_sync_roots() checks the unsync flags without holding 2709 + * the MMU lock and so can race with this. If the SPTE was updated 2710 + * before the page had been marked as unsync-ed, something like the 2711 + * following could happen: 2712 + * 2713 + * CPU 1 CPU 2 2714 + * --------------------------------------------------------------------- 2715 + * 1.2 Host updates SPTE 2716 + * to be writable 2717 + * 2.1 Guest writes a GPTE for GVA X. 2718 + * (GPTE being in the guest page table shadowed 2719 + * by the SP from CPU 1.) 2720 + * This reads SPTE during the page table walk. 2721 + * Since SPTE.W is read as 1, there is no 2722 + * fault. 2723 + * 2724 + * 2.2 Guest issues TLB flush. 2725 + * That causes a VM Exit. 2726 + * 2727 + * 2.3 kvm_mmu_sync_pages() reads sp->unsync. 2728 + * Since it is false, so it just returns. 2729 + * 2730 + * 2.4 Guest accesses GVA X. 2731 + * Since the mapping in the SP was not updated, 2732 + * so the old mapping for GVA X incorrectly 2733 + * gets used. 
2734 + * 1.1 Host marks SP 2735 + * as unsync 2736 + * (sp->unsync = true) 2737 + * 2738 + * The write barrier below ensures that 1.1 happens before 1.2 and thus 2739 + * the situation in 2.4 does not arise. The implicit barrier in 2.2 2740 + * pairs with this write barrier. 2741 + */ 2742 + smp_wmb(); 2743 + 2767 2744 return false; 2768 2745 } 2769 2746 ··· 2824 2723 2825 2724 return true; 2826 2725 } 2726 + 2727 + /* Bits which may be returned by set_spte() */ 2728 + #define SET_SPTE_WRITE_PROTECTED_PT BIT(0) 2729 + #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) 2827 2730 2828 2731 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2829 2732 unsigned pte_access, int level, ··· 2905 2800 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 2906 2801 pgprintk("%s: found shadow page for %llx, marking ro\n", 2907 2802 __func__, gfn); 2908 - ret = 1; 2803 + ret |= SET_SPTE_WRITE_PROTECTED_PT; 2909 2804 pte_access &= ~ACC_WRITE_MASK; 2910 2805 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); 2911 2806 } ··· 2921 2816 2922 2817 set_pte: 2923 2818 if (mmu_spte_update(sptep, spte)) 2924 - kvm_flush_remote_tlbs(vcpu->kvm); 2819 + ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; 2925 2820 done: 2926 2821 return ret; 2927 2822 } ··· 2932 2827 { 2933 2828 int was_rmapped = 0; 2934 2829 int rmap_count; 2830 + int set_spte_ret; 2935 2831 int ret = RET_PF_RETRY; 2832 + bool flush = false; 2936 2833 2937 2834 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, 2938 2835 *sptep, write_fault, gfn); ··· 2951 2844 2952 2845 child = page_header(pte & PT64_BASE_ADDR_MASK); 2953 2846 drop_parent_pte(child, sptep); 2954 - kvm_flush_remote_tlbs(vcpu->kvm); 2847 + flush = true; 2955 2848 } else if (pfn != spte_to_pfn(*sptep)) { 2956 2849 pgprintk("hfn old %llx new %llx\n", 2957 2850 spte_to_pfn(*sptep), pfn); 2958 2851 drop_spte(vcpu->kvm, sptep); 2959 - kvm_flush_remote_tlbs(vcpu->kvm); 2852 + flush = true; 2960 2853 } else 2961 2854 was_rmapped = 1; 2962 2855 } 2963 2856 2964 - if 
(set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, 2965 - true, host_writable)) { 2857 + set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, 2858 + speculative, true, host_writable); 2859 + if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { 2966 2860 if (write_fault) 2967 2861 ret = RET_PF_EMULATE; 2968 2862 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2969 2863 } 2864 + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) 2865 + kvm_flush_remote_tlbs(vcpu->kvm); 2970 2866 2971 2867 if (unlikely(is_mmio_spte(*sptep))) 2972 2868 ret = RET_PF_EMULATE; ··· 3468 3358 *root_hpa = INVALID_PAGE; 3469 3359 } 3470 3360 3471 - void kvm_mmu_free_roots(struct kvm_vcpu *vcpu) 3361 + /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ 3362 + void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) 3472 3363 { 3473 3364 int i; 3474 3365 LIST_HEAD(invalid_list); 3475 3366 struct kvm_mmu *mmu = &vcpu->arch.mmu; 3367 + bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; 3476 3368 3477 - if (!VALID_PAGE(mmu->root_hpa)) 3478 - return; 3369 + BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); 3370 + 3371 + /* Before acquiring the MMU lock, see if we need to do any real work. 
*/ 3372 + if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { 3373 + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 3374 + if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && 3375 + VALID_PAGE(mmu->prev_roots[i].hpa)) 3376 + break; 3377 + 3378 + if (i == KVM_MMU_NUM_PREV_ROOTS) 3379 + return; 3380 + } 3479 3381 3480 3382 spin_lock(&vcpu->kvm->mmu_lock); 3481 3383 3482 - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && 3483 - (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { 3484 - mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list); 3485 - } else { 3486 - for (i = 0; i < 4; ++i) 3487 - if (mmu->pae_root[i] != 0) 3488 - mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i], 3489 - &invalid_list); 3490 - mmu->root_hpa = INVALID_PAGE; 3384 + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 3385 + if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) 3386 + mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, 3387 + &invalid_list); 3388 + 3389 + if (free_active_root) { 3390 + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && 3391 + (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { 3392 + mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, 3393 + &invalid_list); 3394 + } else { 3395 + for (i = 0; i < 4; ++i) 3396 + if (mmu->pae_root[i] != 0) 3397 + mmu_free_root_page(vcpu->kvm, 3398 + &mmu->pae_root[i], 3399 + &invalid_list); 3400 + mmu->root_hpa = INVALID_PAGE; 3401 + } 3491 3402 } 3492 3403 3493 3404 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); ··· 3677 3546 return mmu_alloc_shadow_roots(vcpu); 3678 3547 } 3679 3548 3680 - static void mmu_sync_roots(struct kvm_vcpu *vcpu) 3549 + void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 3681 3550 { 3682 3551 int i; 3683 3552 struct kvm_mmu_page *sp; ··· 3689 3558 return; 3690 3559 3691 3560 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 3692 - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3561 + 3693 3562 if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { 3694 3563 hpa_t root = vcpu->arch.mmu.root_hpa; 3564 + 3695 3565 sp = 
page_header(root); 3566 + 3567 + /* 3568 + * Even if another CPU was marking the SP as unsync-ed 3569 + * simultaneously, any guest page table changes are not 3570 + * guaranteed to be visible anyway until this VCPU issues a TLB 3571 + * flush strictly after those changes are made. We only need to 3572 + * ensure that the other CPU sets these flags before any actual 3573 + * changes to the page tables are made. The comments in 3574 + * mmu_need_write_protect() describe what could go wrong if this 3575 + * requirement isn't satisfied. 3576 + */ 3577 + if (!smp_load_acquire(&sp->unsync) && 3578 + !smp_load_acquire(&sp->unsync_children)) 3579 + return; 3580 + 3581 + spin_lock(&vcpu->kvm->mmu_lock); 3582 + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3583 + 3696 3584 mmu_sync_children(vcpu, sp); 3585 + 3697 3586 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3587 + spin_unlock(&vcpu->kvm->mmu_lock); 3698 3588 return; 3699 3589 } 3590 + 3591 + spin_lock(&vcpu->kvm->mmu_lock); 3592 + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3593 + 3700 3594 for (i = 0; i < 4; ++i) { 3701 3595 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3702 3596 ··· 3731 3575 mmu_sync_children(vcpu, sp); 3732 3576 } 3733 3577 } 3734 - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3735 - } 3736 3578 3737 - void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 3738 - { 3739 - spin_lock(&vcpu->kvm->mmu_lock); 3740 - mmu_sync_roots(vcpu); 3579 + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3741 3580 spin_unlock(&vcpu->kvm->mmu_lock); 3742 3581 } 3743 3582 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); ··· 4099 3948 context->update_pte = nonpaging_update_pte; 4100 3949 context->root_level = 0; 4101 3950 context->shadow_root_level = PT32E_ROOT_LEVEL; 4102 - context->root_hpa = INVALID_PAGE; 4103 3951 context->direct_map = true; 4104 3952 context->nx = false; 4105 3953 } 4106 3954 4107 - void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) 3955 + /* 3956 + * Find out if a previously cached root matching the new CR3/role is available. 
3957 + * The current root is also inserted into the cache. 3958 + * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is 3959 + * returned. 3960 + * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and 3961 + * false is returned. This root should now be freed by the caller. 3962 + */ 3963 + static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, 3964 + union kvm_mmu_page_role new_role) 4108 3965 { 4109 - kvm_mmu_free_roots(vcpu); 3966 + uint i; 3967 + struct kvm_mmu_root_info root; 3968 + struct kvm_mmu *mmu = &vcpu->arch.mmu; 3969 + 3970 + root.cr3 = mmu->get_cr3(vcpu); 3971 + root.hpa = mmu->root_hpa; 3972 + 3973 + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 3974 + swap(root, mmu->prev_roots[i]); 3975 + 3976 + if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) && 3977 + page_header(root.hpa) != NULL && 3978 + new_role.word == page_header(root.hpa)->role.word) 3979 + break; 3980 + } 3981 + 3982 + mmu->root_hpa = root.hpa; 3983 + 3984 + return i < KVM_MMU_NUM_PREV_ROOTS; 4110 3985 } 3986 + 3987 + static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, 3988 + union kvm_mmu_page_role new_role, 3989 + bool skip_tlb_flush) 3990 + { 3991 + struct kvm_mmu *mmu = &vcpu->arch.mmu; 3992 + 3993 + /* 3994 + * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid 3995 + * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs 3996 + * later if necessary. 3997 + */ 3998 + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && 3999 + mmu->root_level >= PT64_ROOT_4LEVEL) { 4000 + if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT)) 4001 + return false; 4002 + 4003 + if (cached_root_available(vcpu, new_cr3, new_role)) { 4004 + /* 4005 + * It is possible that the cached previous root page is 4006 + * obsolete because of a change in the MMU 4007 + * generation number. 
However, that is accompanied by 4008 + * KVM_REQ_MMU_RELOAD, which will free the root that we 4009 + * have set here and allocate a new one. 4010 + */ 4011 + 4012 + kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); 4013 + if (!skip_tlb_flush) { 4014 + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 4015 + kvm_x86_ops->tlb_flush(vcpu, true); 4016 + } 4017 + 4018 + /* 4019 + * The last MMIO access's GVA and GPA are cached in the 4020 + * VCPU. When switching to a new CR3, that GVA->GPA 4021 + * mapping may no longer be valid. So clear any cached 4022 + * MMIO info even when we don't need to sync the shadow 4023 + * page tables. 4024 + */ 4025 + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 4026 + 4027 + __clear_sp_write_flooding_count( 4028 + page_header(mmu->root_hpa)); 4029 + 4030 + return true; 4031 + } 4032 + } 4033 + 4034 + return false; 4035 + } 4036 + 4037 + static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, 4038 + union kvm_mmu_page_role new_role, 4039 + bool skip_tlb_flush) 4040 + { 4041 + if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) 4042 + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT); 4043 + } 4044 + 4045 + void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) 4046 + { 4047 + __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu), 4048 + skip_tlb_flush); 4049 + } 4050 + EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); 4111 4051 4112 4052 static unsigned long get_cr3(struct kvm_vcpu *vcpu) 4113 4053 { ··· 4674 4432 context->invlpg = paging64_invlpg; 4675 4433 context->update_pte = paging64_update_pte; 4676 4434 context->shadow_root_level = level; 4677 - context->root_hpa = INVALID_PAGE; 4678 4435 context->direct_map = false; 4679 4436 } 4680 4437 ··· 4703 4462 context->invlpg = paging32_invlpg; 4704 4463 context->update_pte = paging32_update_pte; 4705 4464 context->shadow_root_level = PT32E_ROOT_LEVEL; 4706 - context->root_hpa = INVALID_PAGE; 4707 4465 context->direct_map = false; 4708 4466 } 4709 4467 ··· 4712 
4472 paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); 4713 4473 } 4714 4474 4475 + static union kvm_mmu_page_role 4476 + kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu) 4477 + { 4478 + union kvm_mmu_page_role role = {0}; 4479 + 4480 + role.guest_mode = is_guest_mode(vcpu); 4481 + role.smm = is_smm(vcpu); 4482 + role.ad_disabled = (shadow_accessed_mask == 0); 4483 + role.level = kvm_x86_ops->get_tdp_level(vcpu); 4484 + role.direct = true; 4485 + role.access = ACC_ALL; 4486 + 4487 + return role; 4488 + } 4489 + 4715 4490 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 4716 4491 { 4717 4492 struct kvm_mmu *context = &vcpu->arch.mmu; 4718 4493 4719 - context->base_role.word = 0; 4720 - context->base_role.guest_mode = is_guest_mode(vcpu); 4721 - context->base_role.smm = is_smm(vcpu); 4722 - context->base_role.ad_disabled = (shadow_accessed_mask == 0); 4494 + context->base_role.word = mmu_base_role_mask.word & 4495 + kvm_calc_tdp_mmu_root_page_role(vcpu).word; 4723 4496 context->page_fault = tdp_page_fault; 4724 4497 context->sync_page = nonpaging_sync_page; 4725 4498 context->invlpg = nonpaging_invlpg; 4726 4499 context->update_pte = nonpaging_update_pte; 4727 4500 context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu); 4728 - context->root_hpa = INVALID_PAGE; 4729 4501 context->direct_map = true; 4730 4502 context->set_cr3 = kvm_x86_ops->set_tdp_cr3; 4731 4503 context->get_cr3 = get_cr3; ··· 4772 4520 reset_tdp_shadow_zero_bits_mask(vcpu, context); 4773 4521 } 4774 4522 4775 - void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) 4523 + static union kvm_mmu_page_role 4524 + kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu) 4776 4525 { 4526 + union kvm_mmu_page_role role = {0}; 4777 4527 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); 4778 4528 bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); 4779 - struct kvm_mmu *context = &vcpu->arch.mmu; 4780 4529 4781 - MMU_WARN_ON(VALID_PAGE(context->root_hpa)); 4530 + role.nxe = is_nx(vcpu); 
4531 + role.cr4_pae = !!is_pae(vcpu); 4532 + role.cr0_wp = is_write_protection(vcpu); 4533 + role.smep_andnot_wp = smep && !is_write_protection(vcpu); 4534 + role.smap_andnot_wp = smap && !is_write_protection(vcpu); 4535 + role.guest_mode = is_guest_mode(vcpu); 4536 + role.smm = is_smm(vcpu); 4537 + role.direct = !is_paging(vcpu); 4538 + role.access = ACC_ALL; 4539 + 4540 + if (!is_long_mode(vcpu)) 4541 + role.level = PT32E_ROOT_LEVEL; 4542 + else if (is_la57_mode(vcpu)) 4543 + role.level = PT64_ROOT_5LEVEL; 4544 + else 4545 + role.level = PT64_ROOT_4LEVEL; 4546 + 4547 + return role; 4548 + } 4549 + 4550 + void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) 4551 + { 4552 + struct kvm_mmu *context = &vcpu->arch.mmu; 4782 4553 4783 4554 if (!is_paging(vcpu)) 4784 4555 nonpaging_init_context(vcpu, context); ··· 4812 4537 else 4813 4538 paging32_init_context(vcpu, context); 4814 4539 4815 - context->base_role.nxe = is_nx(vcpu); 4816 - context->base_role.cr4_pae = !!is_pae(vcpu); 4817 - context->base_role.cr0_wp = is_write_protection(vcpu); 4818 - context->base_role.smep_andnot_wp 4819 - = smep && !is_write_protection(vcpu); 4820 - context->base_role.smap_andnot_wp 4821 - = smap && !is_write_protection(vcpu); 4822 - context->base_role.guest_mode = is_guest_mode(vcpu); 4823 - context->base_role.smm = is_smm(vcpu); 4540 + context->base_role.word = mmu_base_role_mask.word & 4541 + kvm_calc_shadow_mmu_root_page_role(vcpu).word; 4824 4542 reset_shadow_zero_bits_mask(vcpu, context); 4825 4543 } 4826 4544 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 4827 4545 4546 + static union kvm_mmu_page_role 4547 + kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) 4548 + { 4549 + union kvm_mmu_page_role role = vcpu->arch.mmu.base_role; 4550 + 4551 + role.level = PT64_ROOT_4LEVEL; 4552 + role.direct = false; 4553 + role.ad_disabled = !accessed_dirty; 4554 + role.guest_mode = true; 4555 + role.access = ACC_ALL; 4556 + 4557 + return role; 4558 + } 4559 + 4828 4560 void 
kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, 4829 - bool accessed_dirty) 4561 + bool accessed_dirty, gpa_t new_eptp) 4830 4562 { 4831 4563 struct kvm_mmu *context = &vcpu->arch.mmu; 4564 + union kvm_mmu_page_role root_page_role = 4565 + kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); 4832 4566 4833 - MMU_WARN_ON(VALID_PAGE(context->root_hpa)); 4834 - 4567 + __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false); 4835 4568 context->shadow_root_level = PT64_ROOT_4LEVEL; 4836 4569 4837 4570 context->nx = true; ··· 4850 4567 context->invlpg = ept_invlpg; 4851 4568 context->update_pte = ept_update_pte; 4852 4569 context->root_level = PT64_ROOT_4LEVEL; 4853 - context->root_hpa = INVALID_PAGE; 4854 4570 context->direct_map = false; 4855 - context->base_role.ad_disabled = !accessed_dirty; 4856 - context->base_role.guest_mode = 1; 4571 + context->base_role.word = root_page_role.word & mmu_base_role_mask.word; 4857 4572 update_permission_bitmask(vcpu, context, true); 4858 4573 update_pkru_bitmask(vcpu, context, true); 4859 4574 update_last_nonleaf_level(vcpu, context); ··· 4914 4633 update_last_nonleaf_level(vcpu, g_context); 4915 4634 } 4916 4635 4917 - static void init_kvm_mmu(struct kvm_vcpu *vcpu) 4636 + void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) 4918 4637 { 4638 + if (reset_roots) { 4639 + uint i; 4640 + 4641 + vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4642 + 4643 + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 4644 + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; 4645 + } 4646 + 4919 4647 if (mmu_is_nested(vcpu)) 4920 4648 init_kvm_nested_mmu(vcpu); 4921 4649 else if (tdp_enabled) ··· 4932 4642 else 4933 4643 init_kvm_softmmu(vcpu); 4934 4644 } 4645 + EXPORT_SYMBOL_GPL(kvm_init_mmu); 4646 + 4647 + static union kvm_mmu_page_role 4648 + kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) 4649 + { 4650 + if (tdp_enabled) 4651 + return kvm_calc_tdp_mmu_root_page_role(vcpu); 4652 + else 4653 + return 
kvm_calc_shadow_mmu_root_page_role(vcpu); 4654 + } 4935 4655 4936 4656 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 4937 4657 { 4938 4658 kvm_mmu_unload(vcpu); 4939 - init_kvm_mmu(vcpu); 4659 + kvm_init_mmu(vcpu, true); 4940 4660 } 4941 4661 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); 4942 4662 ··· 4961 4661 kvm_mmu_sync_roots(vcpu); 4962 4662 if (r) 4963 4663 goto out; 4964 - /* set_cr3() should ensure TLB has been flushed */ 4965 - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 4664 + kvm_mmu_load_cr3(vcpu); 4665 + kvm_x86_ops->tlb_flush(vcpu, true); 4966 4666 out: 4967 4667 return r; 4968 4668 } ··· 4970 4670 4971 4671 void kvm_mmu_unload(struct kvm_vcpu *vcpu) 4972 4672 { 4973 - kvm_mmu_free_roots(vcpu); 4673 + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL); 4974 4674 WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 4975 4675 } 4976 4676 EXPORT_SYMBOL_GPL(kvm_mmu_unload); ··· 5123 4823 u64 entry, gentry, *spte; 5124 4824 int npte; 5125 4825 bool remote_flush, local_flush; 5126 - union kvm_mmu_page_role mask = { }; 5127 - 5128 - mask.cr0_wp = 1; 5129 - mask.cr4_pae = 1; 5130 - mask.nxe = 1; 5131 - mask.smep_andnot_wp = 1; 5132 - mask.smap_andnot_wp = 1; 5133 - mask.smm = 1; 5134 - mask.guest_mode = 1; 5135 - mask.ad_disabled = 1; 5136 4826 5137 4827 /* 5138 4828 * If we don't have indirect shadow pages, it means no page is ··· 5166 4876 mmu_page_zap_pte(vcpu->kvm, sp, spte); 5167 4877 if (gentry && 5168 4878 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 5169 - & mask.word) && rmap_can_add(vcpu)) 4879 + & mmu_base_role_mask.word) && rmap_can_add(vcpu)) 5170 4880 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 5171 4881 if (need_remote_flush(entry, *spte)) 5172 4882 remote_flush = true; ··· 5291 5001 5292 5002 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 5293 5003 { 5294 - vcpu->arch.mmu.invlpg(vcpu, gva); 5295 - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 5004 + struct kvm_mmu *mmu = &vcpu->arch.mmu; 5005 + int i; 5006 + 5007 + /* INVLPG on a * 
non-canonical address is a NOP according to the SDM. */ 5008 + if (is_noncanonical_address(gva, vcpu)) 5009 + return; 5010 + 5011 + mmu->invlpg(vcpu, gva, mmu->root_hpa); 5012 + 5013 + /* 5014 + * INVLPG is required to invalidate any global mappings for the VA, 5015 + * irrespective of PCID. Since it would take us roughly similar amount 5016 + * of work to determine whether any of the prev_root mappings of the VA 5017 + * is marked global, or to just sync it blindly, so we might as well 5018 + * just always sync it. 5019 + * 5020 + * Mappings not reachable via the current cr3 or the prev_roots will be 5021 + * synced when switching to that cr3, so nothing needs to be done here 5022 + * for them. 5023 + */ 5024 + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5025 + if (VALID_PAGE(mmu->prev_roots[i].hpa)) 5026 + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); 5027 + 5028 + kvm_x86_ops->tlb_flush_gva(vcpu, gva); 5296 5029 ++vcpu->stat.invlpg; 5297 5030 } 5298 5031 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); 5032 + 5033 + void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) 5034 + { 5035 + struct kvm_mmu *mmu = &vcpu->arch.mmu; 5036 + bool tlb_flush = false; 5037 + uint i; 5038 + 5039 + if (pcid == kvm_get_active_pcid(vcpu)) { 5040 + mmu->invlpg(vcpu, gva, mmu->root_hpa); 5041 + tlb_flush = true; 5042 + } 5043 + 5044 + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5045 + if (VALID_PAGE(mmu->prev_roots[i].hpa) && 5046 + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) { 5047 + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); 5048 + tlb_flush = true; 5049 + } 5050 + } 5051 + 5052 + if (tlb_flush) 5053 + kvm_x86_ops->tlb_flush_gva(vcpu, gva); 5054 + 5055 + ++vcpu->stat.invlpg; 5056 + 5057 + /* 5058 + * Mappings not reachable via the current cr3 or the prev_roots will be 5059 + * synced when switching to that cr3, so nothing needs to be done here 5060 + * for them. 
5061 + */ 5062 + } 5063 + EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); 5299 5064 5300 5065 void kvm_enable_tdp(void) 5301 5066 { ··· 5375 5030 struct page *page; 5376 5031 int i; 5377 5032 5033 + if (tdp_enabled) 5034 + return 0; 5035 + 5378 5036 /* 5379 5037 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 5380 5038 * Therefore we need to allocate shadow page tables in the first ··· 5396 5048 5397 5049 int kvm_mmu_create(struct kvm_vcpu *vcpu) 5398 5050 { 5051 + uint i; 5052 + 5399 5053 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 5400 5054 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 5401 5055 vcpu->arch.mmu.translate_gpa = translate_gpa; 5402 5056 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; 5057 + 5058 + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5059 + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; 5403 5060 5404 5061 return alloc_mmu_pages(vcpu); 5405 5062 } ··· 5413 5060 { 5414 5061 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 5415 5062 5416 - init_kvm_mmu(vcpu); 5063 + kvm_init_mmu(vcpu, true); 5417 5064 } 5418 5065 5419 5066 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, ··· 5853 5500 { 5854 5501 int ret = -ENOMEM; 5855 5502 5856 - kvm_mmu_clear_all_pte_masks(); 5503 + kvm_mmu_reset_all_pte_masks(); 5857 5504 5858 5505 pte_list_desc_cache = kmem_cache_create("pte_list_desc", 5859 5506 sizeof(struct pte_list_desc),
+23 -1
arch/x86/kvm/mmu.h
··· 61 61 void 62 62 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 63 63 64 + void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); 64 65 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); 65 66 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, 66 - bool accessed_dirty); 67 + bool accessed_dirty, gpa_t new_eptp); 67 68 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); 68 69 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, 69 70 u64 fault_address, char *insn, int insn_len); ··· 84 83 return 0; 85 84 86 85 return kvm_mmu_load(vcpu); 86 + } 87 + 88 + static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3) 89 + { 90 + BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0); 91 + 92 + return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE) 93 + ? cr3 & X86_CR3_PCID_MASK 94 + : 0; 95 + } 96 + 97 + static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) 98 + { 99 + return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu)); 100 + } 101 + 102 + static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) 103 + { 104 + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) 105 + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa | 106 + kvm_get_active_pcid(vcpu)); 87 107 } 88 108 89 109 /*
+16 -12
arch/x86/kvm/paging_tmpl.h
··· 181 181 * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK 182 182 * to signify readability since it isn't used in the EPT case 183 183 */ 184 - static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) 184 + static inline unsigned FNAME(gpte_access)(u64 gpte) 185 185 { 186 186 unsigned access; 187 187 #if PTTYPE == PTTYPE_EPT ··· 394 394 accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; 395 395 396 396 /* Convert to ACC_*_MASK flags for struct guest_walker. */ 397 - walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask); 398 - walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask); 397 + walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); 398 + walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); 399 399 errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); 400 400 if (unlikely(errcode)) 401 401 goto error; ··· 508 508 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 509 509 510 510 gfn = gpte_to_gfn(gpte); 511 - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 511 + pte_access = sp->role.access & FNAME(gpte_access)(gpte); 512 512 FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); 513 513 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 514 514 no_dirty_log && (pte_access & ACC_WRITE_MASK)); ··· 856 856 return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); 857 857 } 858 858 859 - static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 859 + static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) 860 860 { 861 861 struct kvm_shadow_walk_iterator iterator; 862 862 struct kvm_mmu_page *sp; ··· 871 871 */ 872 872 mmu_topup_memory_caches(vcpu); 873 873 874 - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) { 874 + if (!VALID_PAGE(root_hpa)) { 875 875 WARN_ON(1); 876 876 return; 877 877 } 878 878 879 879 spin_lock(&vcpu->kvm->mmu_lock); 880 - 
for_each_shadow_entry(vcpu, gva, iterator) { 880 + for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { 881 881 level = iterator.level; 882 882 sptep = iterator.sptep; 883 883 ··· 968 968 int i, nr_present = 0; 969 969 bool host_writable; 970 970 gpa_t first_pte_gpa; 971 + int set_spte_ret = 0; 971 972 972 973 /* direct kvm_mmu_page can not be unsync. */ 973 974 BUG_ON(sp->role.direct); ··· 1003 1002 1004 1003 gfn = gpte_to_gfn(gpte); 1005 1004 pte_access = sp->role.access; 1006 - pte_access &= FNAME(gpte_access)(vcpu, gpte); 1005 + pte_access &= FNAME(gpte_access)(gpte); 1007 1006 FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); 1008 1007 1009 1008 if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, ··· 1025 1024 1026 1025 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; 1027 1026 1028 - set_spte(vcpu, &sp->spt[i], pte_access, 1029 - PT_PAGE_TABLE_LEVEL, gfn, 1030 - spte_to_pfn(sp->spt[i]), true, false, 1031 - host_writable); 1027 + set_spte_ret |= set_spte(vcpu, &sp->spt[i], 1028 + pte_access, PT_PAGE_TABLE_LEVEL, 1029 + gfn, spte_to_pfn(sp->spt[i]), 1030 + true, false, host_writable); 1032 1031 } 1032 + 1033 + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) 1034 + kvm_flush_remote_tlbs(vcpu->kvm); 1033 1035 1034 1036 return nr_present; 1035 1037 }
+8 -4
arch/x86/kvm/svm.c
··· 2884 2884 2885 2885 svm->vmcb->control.nested_cr3 = __sme_set(root); 2886 2886 mark_dirty(svm->vmcb, VMCB_NPT); 2887 - svm_flush_tlb(vcpu, true); 2888 2887 } 2889 2888 2890 2889 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, ··· 5434 5435 svm->asid_generation--; 5435 5436 } 5436 5437 5438 + static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) 5439 + { 5440 + struct vcpu_svm *svm = to_svm(vcpu); 5441 + 5442 + invlpga(gva, svm->vmcb->control.asid); 5443 + } 5444 + 5437 5445 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) 5438 5446 { 5439 5447 } ··· 5772 5766 5773 5767 svm->vmcb->save.cr3 = __sme_set(root); 5774 5768 mark_dirty(svm->vmcb, VMCB_CR); 5775 - svm_flush_tlb(vcpu, true); 5776 5769 } 5777 5770 5778 5771 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) ··· 5784 5779 /* Also sync guest cr3 here in case we live migrate */ 5785 5780 svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); 5786 5781 mark_dirty(svm->vmcb, VMCB_CR); 5787 - 5788 - svm_flush_tlb(vcpu, true); 5789 5782 } 5790 5783 5791 5784 static int is_disabled(void) ··· 7093 7090 .set_rflags = svm_set_rflags, 7094 7091 7095 7092 .tlb_flush = svm_flush_tlb, 7093 + .tlb_flush_gva = svm_flush_tlb_gva, 7096 7094 7097 7095 .run = svm_vcpu_run, 7098 7096 .handle_exit = handle_exit,
+913 -211
arch/x86/kvm/vmx.c
··· 38 38 #include "kvm_cache_regs.h" 39 39 #include "x86.h" 40 40 41 + #include <asm/asm.h> 41 42 #include <asm/cpu.h> 42 43 #include <asm/io.h> 43 44 #include <asm/desc.h> ··· 333 332 }; 334 333 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 335 334 335 + enum ept_pointers_status { 336 + EPT_POINTERS_CHECK = 0, 337 + EPT_POINTERS_MATCH = 1, 338 + EPT_POINTERS_MISMATCH = 2 339 + }; 340 + 336 341 struct kvm_vmx { 337 342 struct kvm kvm; 338 343 339 344 unsigned int tss_addr; 340 345 bool ept_identity_pagetable_done; 341 346 gpa_t ept_identity_map_addr; 347 + 348 + enum ept_pointers_status ept_pointers_match; 349 + spinlock_t ept_pointer_lock; 342 350 }; 343 351 344 352 #define NR_AUTOLOAD_MSRS 8 345 353 354 + struct vmcs_hdr { 355 + u32 revision_id:31; 356 + u32 shadow_vmcs:1; 357 + }; 358 + 346 359 struct vmcs { 347 - u32 revision_id; 360 + struct vmcs_hdr hdr; 348 361 u32 abort; 349 362 char data[0]; 363 + }; 364 + 365 + /* 366 + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT 367 + * and whose values change infrequently, but are not constant. I.e. this is 368 + * used as a write-through cache of the corresponding VMCS fields. 
369 + */ 370 + struct vmcs_host_state { 371 + unsigned long cr3; /* May not match real cr3 */ 372 + unsigned long cr4; /* May not match real cr4 */ 373 + unsigned long gs_base; 374 + unsigned long fs_base; 375 + 376 + u16 fs_sel, gs_sel, ldt_sel; 377 + #ifdef CONFIG_X86_64 378 + u16 ds_sel, es_sel; 379 + #endif 350 380 }; 351 381 352 382 /* ··· 391 359 int cpu; 392 360 bool launched; 393 361 bool nmi_known_unmasked; 394 - unsigned long vmcs_host_cr3; /* May not match real cr3 */ 395 - unsigned long vmcs_host_cr4; /* May not match real cr4 */ 396 362 /* Support for vnmi-less CPUs */ 397 363 int soft_vnmi_blocked; 398 364 ktime_t entry_time; 399 365 s64 vnmi_blocked_time; 400 366 unsigned long *msr_bitmap; 401 367 struct list_head loaded_vmcss_on_cpu_link; 368 + struct vmcs_host_state host_state; 402 369 }; 403 370 404 371 struct shared_msr_entry { ··· 428 397 /* According to the Intel spec, a VMCS region must start with the 429 398 * following two fields. Then follow implementation-specific data. 430 399 */ 431 - u32 revision_id; 400 + struct vmcs_hdr hdr; 432 401 u32 abort; 433 402 434 403 u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ ··· 596 565 "Offset of " #field " in struct vmcs12 has changed.") 597 566 598 567 static inline void vmx_check_vmcs12_offsets(void) { 599 - CHECK_OFFSET(revision_id, 0); 568 + CHECK_OFFSET(hdr, 0); 600 569 CHECK_OFFSET(abort, 4); 601 570 CHECK_OFFSET(launch_state, 8); 602 571 CHECK_OFFSET(io_bitmap_a, 40); ··· 815 784 */ 816 785 struct vmcs12 *cached_vmcs12; 817 786 /* 787 + * Cache of the guest's shadow VMCS, existing outside of guest 788 + * memory. Loaded from guest memory during VM entry. Flushed 789 + * to guest memory during VM exit. 790 + */ 791 + struct vmcs12 *cached_shadow_vmcs12; 792 + /* 818 793 * Indicates if the shadow vmcs must be updated with the 819 794 * data hold by vmcs12 820 795 */ ··· 970 933 /* 971 934 * loaded_vmcs points to the VMCS currently used in this vcpu. 
For a 972 935 * non-nested (L1) guest, it always points to vmcs01. For a nested 973 - * guest (L2), it points to a different VMCS. 936 + * guest (L2), it points to a different VMCS. loaded_cpu_state points 937 + * to the VMCS whose state is loaded into the CPU registers that only 938 + * need to be switched when transitioning to/from the kernel; a NULL 939 + * value indicates that host state is loaded. 974 940 */ 975 941 struct loaded_vmcs vmcs01; 976 942 struct loaded_vmcs *loaded_vmcs; 943 + struct loaded_vmcs *loaded_cpu_state; 977 944 bool __launched; /* temporary, used in vmx_vcpu_run */ 978 945 struct msr_autoload { 979 946 struct vmx_msrs guest; 980 947 struct vmx_msrs host; 981 948 } msr_autoload; 982 - struct { 983 - int loaded; 984 - u16 fs_sel, gs_sel, ldt_sel; 985 - #ifdef CONFIG_X86_64 986 - u16 ds_sel, es_sel; 987 - #endif 988 - int gs_ldt_reload_needed; 989 - int fs_reload_needed; 990 - u64 msr_host_bndcfgs; 991 - } host_state; 949 + 992 950 struct { 993 951 int vm86_active; 994 952 ulong save_rflags; ··· 1033 1001 */ 1034 1002 u64 msr_ia32_feature_control; 1035 1003 u64 msr_ia32_feature_control_valid_bits; 1004 + u64 ept_pointer; 1036 1005 }; 1037 1006 1038 1007 enum segment_cache_field { ··· 1251 1218 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) 1252 1219 { 1253 1220 return to_vmx(vcpu)->nested.cached_vmcs12; 1221 + } 1222 + 1223 + static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) 1224 + { 1225 + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; 1254 1226 } 1255 1227 1256 1228 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); ··· 1527 1489 * Currently unsupported in KVM: 1528 1490 * GUEST_IA32_RTIT_CTL = 0x00002814, 1529 1491 */ 1492 + } 1493 + 1494 + /* check_ept_pointer() should be under protection of ept_pointer_lock. 
*/ 1495 + static void check_ept_pointer_match(struct kvm *kvm) 1496 + { 1497 + struct kvm_vcpu *vcpu; 1498 + u64 tmp_eptp = INVALID_PAGE; 1499 + int i; 1500 + 1501 + kvm_for_each_vcpu(i, vcpu, kvm) { 1502 + if (!VALID_PAGE(tmp_eptp)) { 1503 + tmp_eptp = to_vmx(vcpu)->ept_pointer; 1504 + } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { 1505 + to_kvm_vmx(kvm)->ept_pointers_match 1506 + = EPT_POINTERS_MISMATCH; 1507 + return; 1508 + } 1509 + } 1510 + 1511 + to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; 1512 + } 1513 + 1514 + static int vmx_hv_remote_flush_tlb(struct kvm *kvm) 1515 + { 1516 + int ret; 1517 + 1518 + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); 1519 + 1520 + if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) 1521 + check_ept_pointer_match(kvm); 1522 + 1523 + if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { 1524 + ret = -ENOTSUPP; 1525 + goto out; 1526 + } 1527 + 1528 + ret = hyperv_flush_guest_mapping( 1529 + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); 1530 + 1531 + out: 1532 + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); 1533 + return ret; 1530 1534 } 1531 1535 #else /* !IS_ENABLED(CONFIG_HYPERV) */ 1532 1536 static inline void evmcs_write64(unsigned long field, u64 value) {} ··· 1944 1864 CPU_BASED_MONITOR_TRAP_FLAG; 1945 1865 } 1946 1866 1867 + static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) 1868 + { 1869 + return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & 1870 + SECONDARY_EXEC_SHADOW_VMCS; 1871 + } 1872 + 1947 1873 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) 1948 1874 { 1949 1875 return vmcs12->cpu_based_vm_exec_control & bit; ··· 2030 1944 VMX_VMFUNC_EPTP_SWITCHING); 2031 1945 } 2032 1946 1947 + static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) 1948 + { 1949 + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); 1950 + } 1951 + 2033 1952 static inline bool is_nmi(u32 intr_info) 2034 1953 { 2035 1954 return (intr_info & 
(INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) ··· 2065 1974 u64 rsvd : 48; 2066 1975 u64 gva; 2067 1976 } operand = { vpid, 0, gva }; 1977 + bool error; 2068 1978 2069 - asm volatile (__ex(ASM_VMX_INVVPID) 2070 - /* CF==1 or ZF==1 --> rc = -1 */ 2071 - "; ja 1f ; ud2 ; 1:" 2072 - : : "a"(&operand), "c"(ext) : "cc", "memory"); 1979 + asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na) 1980 + : CC_OUT(na) (error) : "a"(&operand), "c"(ext) 1981 + : "memory"); 1982 + BUG_ON(error); 2073 1983 } 2074 1984 2075 1985 static inline void __invept(int ext, u64 eptp, gpa_t gpa) ··· 2078 1986 struct { 2079 1987 u64 eptp, gpa; 2080 1988 } operand = {eptp, gpa}; 1989 + bool error; 2081 1990 2082 - asm volatile (__ex(ASM_VMX_INVEPT) 2083 - /* CF==1 or ZF==1 --> rc = -1 */ 2084 - "; ja 1f ; ud2 ; 1:\n" 2085 - : : "a" (&operand), "c" (ext) : "cc", "memory"); 1991 + asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na) 1992 + : CC_OUT(na) (error) : "a" (&operand), "c" (ext) 1993 + : "memory"); 1994 + BUG_ON(error); 2086 1995 } 2087 1996 2088 1997 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) ··· 2099 2006 static void vmcs_clear(struct vmcs *vmcs) 2100 2007 { 2101 2008 u64 phys_addr = __pa(vmcs); 2102 - u8 error; 2009 + bool error; 2103 2010 2104 - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" 2105 - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 2106 - : "cc", "memory"); 2107 - if (error) 2011 + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na) 2012 + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) 2013 + : "memory"); 2014 + if (unlikely(error)) 2108 2015 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", 2109 2016 vmcs, phys_addr); 2110 2017 } ··· 2121 2028 static void vmcs_load(struct vmcs *vmcs) 2122 2029 { 2123 2030 u64 phys_addr = __pa(vmcs); 2124 - u8 error; 2031 + bool error; 2125 2032 2126 2033 if (static_branch_unlikely(&enable_evmcs)) 2127 2034 return evmcs_load(phys_addr); 2128 2035 2129 - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; 
setna %0" 2130 - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 2131 - : "cc", "memory"); 2132 - if (error) 2036 + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na) 2037 + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) 2038 + : "memory"); 2039 + if (unlikely(error)) 2133 2040 printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", 2134 2041 vmcs, phys_addr); 2135 2042 } ··· 2205 2112 if (cpu != -1) 2206 2113 smp_call_function_single(cpu, 2207 2114 __loaded_vmcs_clear, loaded_vmcs, 1); 2115 + } 2116 + 2117 + static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) 2118 + { 2119 + if (vpid == 0) 2120 + return true; 2121 + 2122 + if (cpu_has_vmx_invvpid_individual_addr()) { 2123 + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); 2124 + return true; 2125 + } 2126 + 2127 + return false; 2208 2128 } 2209 2129 2210 2130 static inline void vpid_sync_vcpu_single(int vpid) ··· 2354 2248 2355 2249 static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) 2356 2250 { 2357 - u8 error; 2251 + bool error; 2358 2252 2359 - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" 2360 - : "=q"(error) : "a"(value), "d"(field) : "cc"); 2253 + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na) 2254 + : CC_OUT(na) (error) : "a"(value), "d"(field)); 2361 2255 if (unlikely(error)) 2362 2256 vmwrite_error(field, value); 2363 2257 } ··· 2841 2735 } 2842 2736 #endif 2843 2737 2844 - static void vmx_save_host_state(struct kvm_vcpu *vcpu) 2738 + static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 2845 2739 { 2846 2740 struct vcpu_vmx *vmx = to_vmx(vcpu); 2741 + struct vmcs_host_state *host_state; 2847 2742 #ifdef CONFIG_X86_64 2848 2743 int cpu = raw_smp_processor_id(); 2849 - unsigned long fs_base, kernel_gs_base; 2850 2744 #endif 2745 + unsigned long fs_base, gs_base; 2746 + u16 fs_sel, gs_sel; 2851 2747 int i; 2852 2748 2853 - if (vmx->host_state.loaded) 2749 + if (vmx->loaded_cpu_state) 2854 2750 return; 2855 2751 2856 - 
vmx->host_state.loaded = 1; 2752 + vmx->loaded_cpu_state = vmx->loaded_vmcs; 2753 + host_state = &vmx->loaded_cpu_state->host_state; 2754 + 2857 2755 /* 2858 2756 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 2859 2757 * allow segment selectors with cpl > 0 or ti == 1. 2860 2758 */ 2861 - vmx->host_state.ldt_sel = kvm_read_ldt(); 2862 - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; 2759 + host_state->ldt_sel = kvm_read_ldt(); 2863 2760 2864 2761 #ifdef CONFIG_X86_64 2762 + savesegment(ds, host_state->ds_sel); 2763 + savesegment(es, host_state->es_sel); 2764 + 2765 + gs_base = cpu_kernelmode_gs_base(cpu); 2865 2766 if (likely(is_64bit_mm(current->mm))) { 2866 2767 save_fsgs_for_kvm(); 2867 - vmx->host_state.fs_sel = current->thread.fsindex; 2868 - vmx->host_state.gs_sel = current->thread.gsindex; 2768 + fs_sel = current->thread.fsindex; 2769 + gs_sel = current->thread.gsindex; 2869 2770 fs_base = current->thread.fsbase; 2870 - kernel_gs_base = current->thread.gsbase; 2771 + vmx->msr_host_kernel_gs_base = current->thread.gsbase; 2871 2772 } else { 2872 - #endif 2873 - savesegment(fs, vmx->host_state.fs_sel); 2874 - savesegment(gs, vmx->host_state.gs_sel); 2875 - #ifdef CONFIG_X86_64 2773 + savesegment(fs, fs_sel); 2774 + savesegment(gs, gs_sel); 2876 2775 fs_base = read_msr(MSR_FS_BASE); 2877 - kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 2878 - } 2879 - #endif 2880 - if (!(vmx->host_state.fs_sel & 7)) { 2881 - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); 2882 - vmx->host_state.fs_reload_needed = 0; 2883 - } else { 2884 - vmcs_write16(HOST_FS_SELECTOR, 0); 2885 - vmx->host_state.fs_reload_needed = 1; 2886 - } 2887 - if (!(vmx->host_state.gs_sel & 7)) 2888 - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); 2889 - else { 2890 - vmcs_write16(HOST_GS_SELECTOR, 0); 2891 - vmx->host_state.gs_ldt_reload_needed = 1; 2776 + vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 2892 2777 } 2893 2778 2894 - #ifdef 
CONFIG_X86_64 2895 - savesegment(ds, vmx->host_state.ds_sel); 2896 - savesegment(es, vmx->host_state.es_sel); 2897 - 2898 - vmcs_writel(HOST_FS_BASE, fs_base); 2899 - vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); 2900 - 2901 - vmx->msr_host_kernel_gs_base = kernel_gs_base; 2902 2779 if (is_long_mode(&vmx->vcpu)) 2903 2780 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 2904 2781 #else 2905 - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); 2906 - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); 2782 + savesegment(fs, fs_sel); 2783 + savesegment(gs, gs_sel); 2784 + fs_base = segment_base(fs_sel); 2785 + gs_base = segment_base(gs_sel); 2907 2786 #endif 2908 - if (boot_cpu_has(X86_FEATURE_MPX)) 2909 - rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); 2787 + 2788 + if (unlikely(fs_sel != host_state->fs_sel)) { 2789 + if (!(fs_sel & 7)) 2790 + vmcs_write16(HOST_FS_SELECTOR, fs_sel); 2791 + else 2792 + vmcs_write16(HOST_FS_SELECTOR, 0); 2793 + host_state->fs_sel = fs_sel; 2794 + } 2795 + if (unlikely(gs_sel != host_state->gs_sel)) { 2796 + if (!(gs_sel & 7)) 2797 + vmcs_write16(HOST_GS_SELECTOR, gs_sel); 2798 + else 2799 + vmcs_write16(HOST_GS_SELECTOR, 0); 2800 + host_state->gs_sel = gs_sel; 2801 + } 2802 + if (unlikely(fs_base != host_state->fs_base)) { 2803 + vmcs_writel(HOST_FS_BASE, fs_base); 2804 + host_state->fs_base = fs_base; 2805 + } 2806 + if (unlikely(gs_base != host_state->gs_base)) { 2807 + vmcs_writel(HOST_GS_BASE, gs_base); 2808 + host_state->gs_base = gs_base; 2809 + } 2810 + 2910 2811 for (i = 0; i < vmx->save_nmsrs; ++i) 2911 2812 kvm_set_shared_msr(vmx->guest_msrs[i].index, 2912 2813 vmx->guest_msrs[i].data, 2913 2814 vmx->guest_msrs[i].mask); 2914 2815 } 2915 2816 2916 - static void __vmx_load_host_state(struct vcpu_vmx *vmx) 2817 + static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) 2917 2818 { 2918 - if (!vmx->host_state.loaded) 2819 + struct vmcs_host_state *host_state; 2820 + 
2821 + if (!vmx->loaded_cpu_state) 2919 2822 return; 2920 2823 2824 + WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); 2825 + host_state = &vmx->loaded_cpu_state->host_state; 2826 + 2921 2827 ++vmx->vcpu.stat.host_state_reload; 2922 - vmx->host_state.loaded = 0; 2828 + vmx->loaded_cpu_state = NULL; 2829 + 2923 2830 #ifdef CONFIG_X86_64 2924 2831 if (is_long_mode(&vmx->vcpu)) 2925 2832 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 2926 2833 #endif 2927 - if (vmx->host_state.gs_ldt_reload_needed) { 2928 - kvm_load_ldt(vmx->host_state.ldt_sel); 2834 + if (host_state->ldt_sel || (host_state->gs_sel & 7)) { 2835 + kvm_load_ldt(host_state->ldt_sel); 2929 2836 #ifdef CONFIG_X86_64 2930 - load_gs_index(vmx->host_state.gs_sel); 2837 + load_gs_index(host_state->gs_sel); 2931 2838 #else 2932 - loadsegment(gs, vmx->host_state.gs_sel); 2839 + loadsegment(gs, host_state->gs_sel); 2933 2840 #endif 2934 2841 } 2935 - if (vmx->host_state.fs_reload_needed) 2936 - loadsegment(fs, vmx->host_state.fs_sel); 2842 + if (host_state->fs_sel & 7) 2843 + loadsegment(fs, host_state->fs_sel); 2937 2844 #ifdef CONFIG_X86_64 2938 - if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { 2939 - loadsegment(ds, vmx->host_state.ds_sel); 2940 - loadsegment(es, vmx->host_state.es_sel); 2845 + if (unlikely(host_state->ds_sel | host_state->es_sel)) { 2846 + loadsegment(ds, host_state->ds_sel); 2847 + loadsegment(es, host_state->es_sel); 2941 2848 } 2942 2849 #endif 2943 2850 invalidate_tss_limit(); 2944 2851 #ifdef CONFIG_X86_64 2945 2852 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 2946 2853 #endif 2947 - if (vmx->host_state.msr_host_bndcfgs) 2948 - wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); 2949 2854 load_fixmap_gdt(raw_smp_processor_id()); 2950 2855 } 2951 2856 2952 - static void vmx_load_host_state(struct vcpu_vmx *vmx) 2857 + #ifdef CONFIG_X86_64 2858 + static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) 2953 2859 { 2954 - 
preempt_disable(); 2955 - __vmx_load_host_state(vmx); 2956 - preempt_enable(); 2860 + if (is_long_mode(&vmx->vcpu)) { 2861 + preempt_disable(); 2862 + if (vmx->loaded_cpu_state) 2863 + rdmsrl(MSR_KERNEL_GS_BASE, 2864 + vmx->msr_guest_kernel_gs_base); 2865 + preempt_enable(); 2866 + } 2867 + return vmx->msr_guest_kernel_gs_base; 2957 2868 } 2869 + 2870 + static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) 2871 + { 2872 + if (is_long_mode(&vmx->vcpu)) { 2873 + preempt_disable(); 2874 + if (vmx->loaded_cpu_state) 2875 + wrmsrl(MSR_KERNEL_GS_BASE, data); 2876 + preempt_enable(); 2877 + } 2878 + vmx->msr_guest_kernel_gs_base = data; 2879 + } 2880 + #endif 2958 2881 2959 2882 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) 2960 2883 { ··· 3126 2991 { 3127 2992 vmx_vcpu_pi_put(vcpu); 3128 2993 3129 - __vmx_load_host_state(to_vmx(vcpu)); 2994 + vmx_prepare_switch_to_host(to_vmx(vcpu)); 3130 2995 } 3131 2996 3132 2997 static bool emulation_required(struct kvm_vcpu *vcpu) ··· 3347 3212 3348 3213 static bool vmx_invpcid_supported(void) 3349 3214 { 3350 - return cpu_has_vmx_invpcid() && enable_ept; 3215 + return cpu_has_vmx_invpcid(); 3351 3216 } 3352 3217 3353 3218 /* ··· 3590 3455 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3591 3456 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3592 3457 SECONDARY_EXEC_WBINVD_EXITING; 3458 + /* 3459 + * We can emulate "VMCS shadowing," even if the hardware 3460 + * doesn't support it. 
3461 + */ 3462 + msrs->secondary_ctls_high |= 3463 + SECONDARY_EXEC_SHADOW_VMCS; 3593 3464 3594 3465 if (enable_ept) { 3595 3466 /* nested EPT: emulate EPT also to L1 */ ··· 4063 3922 msr_info->data = vmcs_readl(GUEST_GS_BASE); 4064 3923 break; 4065 3924 case MSR_KERNEL_GS_BASE: 4066 - vmx_load_host_state(vmx); 4067 - msr_info->data = vmx->msr_guest_kernel_gs_base; 3925 + msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 4068 3926 break; 4069 3927 #endif 4070 3928 case MSR_EFER: ··· 4163 4023 vmcs_writel(GUEST_GS_BASE, data); 4164 4024 break; 4165 4025 case MSR_KERNEL_GS_BASE: 4166 - vmx_load_host_state(vmx); 4167 - vmx->msr_guest_kernel_gs_base = data; 4026 + vmx_write_guest_kernel_gs_base(vmx, data); 4168 4027 break; 4169 4028 #endif 4170 4029 case MSR_IA32_SYSENTER_CS: ··· 4698 4559 return 0; 4699 4560 } 4700 4561 4701 - static struct vmcs *alloc_vmcs_cpu(int cpu) 4562 + static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) 4702 4563 { 4703 4564 int node = cpu_to_node(cpu); 4704 4565 struct page *pages; ··· 4712 4573 4713 4574 /* KVM supports Enlightened VMCS v1 only */ 4714 4575 if (static_branch_unlikely(&enable_evmcs)) 4715 - vmcs->revision_id = KVM_EVMCS_VERSION; 4576 + vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 4716 4577 else 4717 - vmcs->revision_id = vmcs_config.revision_id; 4578 + vmcs->hdr.revision_id = vmcs_config.revision_id; 4718 4579 4580 + if (shadow) 4581 + vmcs->hdr.shadow_vmcs = 1; 4719 4582 return vmcs; 4720 4583 } 4721 4584 ··· 4741 4600 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 4742 4601 } 4743 4602 4744 - static struct vmcs *alloc_vmcs(void) 4603 + static struct vmcs *alloc_vmcs(bool shadow) 4745 4604 { 4746 - return alloc_vmcs_cpu(raw_smp_processor_id()); 4605 + return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); 4747 4606 } 4748 4607 4749 4608 static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 4750 4609 { 4751 - loaded_vmcs->vmcs = alloc_vmcs(); 4610 + loaded_vmcs->vmcs = alloc_vmcs(false); 4752 4611 if 
(!loaded_vmcs->vmcs) 4753 4612 return -ENOMEM; 4754 4613 ··· 4770 4629 evmcs->hv_enlightenments_control.msr_bitmap = 1; 4771 4630 } 4772 4631 } 4632 + 4633 + memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 4634 + 4773 4635 return 0; 4774 4636 4775 4637 out_vmcs: ··· 4882 4738 for_each_possible_cpu(cpu) { 4883 4739 struct vmcs *vmcs; 4884 4740 4885 - vmcs = alloc_vmcs_cpu(cpu); 4741 + vmcs = alloc_vmcs_cpu(false, cpu); 4886 4742 if (!vmcs) { 4887 4743 free_kvm_area(); 4888 4744 return -ENOMEM; ··· 4899 4755 * physical CPU. 4900 4756 */ 4901 4757 if (static_branch_unlikely(&enable_evmcs)) 4902 - vmcs->revision_id = vmcs_config.revision_id; 4758 + vmcs->hdr.revision_id = vmcs_config.revision_id; 4903 4759 4904 4760 per_cpu(vmxarea, cpu) = vmcs; 4905 4761 } ··· 5056 4912 return; 5057 4913 5058 4914 /* 5059 - * Force kernel_gs_base reloading before EFER changes, as control 5060 - * of this msr depends on is_long_mode(). 4915 + * MSR_KERNEL_GS_BASE is not intercepted when the guest is in 4916 + * 64-bit mode as a 64-bit kernel may frequently access the 4917 + * MSR. This means we need to manually save/restore the MSR 4918 + * when switching between guest and host state, but only if 4919 + * the guest is in 64-bit mode. Sync our cached value if the 4920 + * guest is transitioning to 32-bit mode and the CPU contains 4921 + * guest state, i.e. the cache is stale. 
5061 4922 */ 5062 - vmx_load_host_state(to_vmx(vcpu)); 4923 + #ifdef CONFIG_X86_64 4924 + if (!(efer & EFER_LMA)) 4925 + (void)vmx_read_guest_kernel_gs_base(vmx); 4926 + #endif 5063 4927 vcpu->arch.efer = efer; 5064 4928 if (efer & EFER_LMA) { 5065 4929 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); ··· 5122 4970 static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) 5123 4971 { 5124 4972 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); 4973 + } 4974 + 4975 + static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 4976 + { 4977 + int vpid = to_vmx(vcpu)->vpid; 4978 + 4979 + if (!vpid_sync_vcpu_addr(vpid, addr)) 4980 + vpid_sync_context(vpid); 4981 + 4982 + /* 4983 + * If VPIDs are not supported or enabled, then the above is a no-op. 4984 + * But we don't really need a TLB flush in that case anyway, because 4985 + * each VM entry/exit includes an implicit flush when VPID is 0. 4986 + */ 5125 4987 } 5126 4988 5127 4989 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) ··· 5319 5153 5320 5154 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 5321 5155 { 5156 + struct kvm *kvm = vcpu->kvm; 5322 5157 unsigned long guest_cr3; 5323 5158 u64 eptp; 5324 5159 ··· 5327 5160 if (enable_ept) { 5328 5161 eptp = construct_eptp(vcpu, cr3); 5329 5162 vmcs_write64(EPT_POINTER, eptp); 5163 + 5164 + if (kvm_x86_ops->tlb_remote_flush) { 5165 + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); 5166 + to_vmx(vcpu)->ept_pointer = eptp; 5167 + to_kvm_vmx(kvm)->ept_pointers_match 5168 + = EPT_POINTERS_CHECK; 5169 + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); 5170 + } 5171 + 5330 5172 if (enable_unrestricted_guest || is_paging(vcpu) || 5331 5173 is_guest_mode(vcpu)) 5332 5174 guest_cr3 = kvm_read_cr3(vcpu); 5333 5175 else 5334 - guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr; 5176 + guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 5335 5177 ept_load_pdptrs(vcpu); 5336 5178 } 5337 5179 5338 - 
vmx_flush_tlb(vcpu, true); 5339 5180 vmcs_writel(GUEST_CR3, guest_cr3); 5340 5181 } 5341 5182 ··· 6279 6104 */ 6280 6105 cr3 = __read_cr3(); 6281 6106 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 6282 - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; 6107 + vmx->loaded_vmcs->host_state.cr3 = cr3; 6283 6108 6284 6109 /* Save the most likely value for this task's CR4 in the VMCS. */ 6285 6110 cr4 = cr4_read_shadow(); 6286 6111 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 6287 - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; 6112 + vmx->loaded_vmcs->host_state.cr4 = cr4; 6288 6113 6289 6114 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 6290 6115 #ifdef CONFIG_X86_64 6291 6116 /* 6292 6117 * Load null selectors, so we can avoid reloading them in 6293 - * __vmx_load_host_state(), in case userspace uses the null selectors 6294 - * too (the expected case). 6118 + * vmx_prepare_switch_to_host(), in case userspace uses 6119 + * the null selectors too (the expected case). 6295 6120 */ 6296 6121 vmcs_write16(HOST_DS_SELECTOR, 0); 6297 6122 vmcs_write16(HOST_ES_SELECTOR, 0); ··· 6416 6241 if (!enable_ept) { 6417 6242 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 6418 6243 enable_unrestricted_guest = 0; 6419 - /* Enable INVPCID for non-ept guests may cause performance regression. 
*/ 6420 - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 6421 6244 } 6422 6245 if (!enable_unrestricted_guest) 6423 6246 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; ··· 6544 6371 */ 6545 6372 static void vmx_vcpu_setup(struct vcpu_vmx *vmx) 6546 6373 { 6547 - #ifdef CONFIG_X86_64 6548 - unsigned long a; 6549 - #endif 6550 6374 int i; 6551 6375 6552 6376 if (enable_shadow_vmcs) { ··· 6598 6428 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 6599 6429 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 6600 6430 vmx_set_constant_host_state(vmx); 6601 - #ifdef CONFIG_X86_64 6602 - rdmsrl(MSR_FS_BASE, a); 6603 - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 6604 - rdmsrl(MSR_GS_BASE, a); 6605 - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ 6606 - #else 6607 6431 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 6608 6432 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 6609 - #endif 6610 6433 6611 6434 if (cpu_has_vmx_vmfunc()) 6612 6435 vmcs_write64(VM_FUNCTION_CONTROL, 0); ··· 7833 7670 7834 7671 static __init int hardware_setup(void) 7835 7672 { 7673 + unsigned long host_bndcfgs; 7836 7674 int r = -ENOMEM, i; 7837 7675 7838 7676 rdmsrl_safe(MSR_EFER, &host_efer); ··· 7857 7693 7858 7694 if (boot_cpu_has(X86_FEATURE_NX)) 7859 7695 kvm_enable_efer_bits(EFER_NX); 7696 + 7697 + if (boot_cpu_has(X86_FEATURE_MPX)) { 7698 + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); 7699 + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); 7700 + } 7860 7701 7861 7702 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 7862 7703 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) ··· 7899 7730 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 7900 7731 kvm_disable_largepages(); 7901 7732 7733 + #if IS_ENABLED(CONFIG_HYPERV) 7734 + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 7735 + && enable_ept) 7736 + kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; 7737 + #endif 7738 + 7902 7739 if (!cpu_has_vmx_ple()) { 7903 7740 ple_gap = 0; 7904 7741 ple_window = 0; 
··· 7930 7755 vmx_enable_tdp(); 7931 7756 else 7932 7757 kvm_disable_tdp(); 7758 + 7759 + if (!nested) { 7760 + kvm_x86_ops->get_nested_state = NULL; 7761 + kvm_x86_ops->set_nested_state = NULL; 7762 + } 7933 7763 7934 7764 /* 7935 7765 * Only enable PML when hardware supports PML feature, and both EPT ··· 8212 8032 return 0; 8213 8033 } 8214 8034 8035 + /* 8036 + * Allocate a shadow VMCS and associate it with the currently loaded 8037 + * VMCS, unless such a shadow VMCS already exists. The newly allocated 8038 + * VMCS is also VMCLEARed, so that it is ready for use. 8039 + */ 8040 + static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 8041 + { 8042 + struct vcpu_vmx *vmx = to_vmx(vcpu); 8043 + struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 8044 + 8045 + /* 8046 + * We should allocate a shadow vmcs for vmcs01 only when L1 8047 + * executes VMXON and free it when L1 executes VMXOFF. 8048 + * As it is invalid to execute VMXON twice, we shouldn't reach 8049 + * here when vmcs01 already have an allocated shadow vmcs. 
8050 + */ 8051 + WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); 8052 + 8053 + if (!loaded_vmcs->shadow_vmcs) { 8054 + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 8055 + if (loaded_vmcs->shadow_vmcs) 8056 + vmcs_clear(loaded_vmcs->shadow_vmcs); 8057 + } 8058 + return loaded_vmcs->shadow_vmcs; 8059 + } 8060 + 8215 8061 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 8216 8062 { 8217 8063 struct vcpu_vmx *vmx = to_vmx(vcpu); 8218 - struct vmcs *shadow_vmcs; 8219 8064 int r; 8220 8065 8221 8066 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); ··· 8251 8046 if (!vmx->nested.cached_vmcs12) 8252 8047 goto out_cached_vmcs12; 8253 8048 8254 - if (enable_shadow_vmcs) { 8255 - shadow_vmcs = alloc_vmcs(); 8256 - if (!shadow_vmcs) 8257 - goto out_shadow_vmcs; 8258 - /* mark vmcs as shadow */ 8259 - shadow_vmcs->revision_id |= (1u << 31); 8260 - /* init shadow vmcs */ 8261 - vmcs_clear(shadow_vmcs); 8262 - vmx->vmcs01.shadow_vmcs = shadow_vmcs; 8263 - } 8049 + vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); 8050 + if (!vmx->nested.cached_shadow_vmcs12) 8051 + goto out_cached_shadow_vmcs12; 8052 + 8053 + if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 8054 + goto out_shadow_vmcs; 8264 8055 8265 8056 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 8266 8057 HRTIMER_MODE_REL_PINNED); ··· 8268 8067 return 0; 8269 8068 8270 8069 out_shadow_vmcs: 8070 + kfree(vmx->nested.cached_shadow_vmcs12); 8071 + 8072 + out_cached_shadow_vmcs12: 8271 8073 kfree(vmx->nested.cached_vmcs12); 8272 8074 8273 8075 out_cached_vmcs12: ··· 8313 8109 8314 8110 /* CPL=0 must be checked manually. 
*/ 8315 8111 if (vmx_get_cpl(vcpu)) { 8316 - kvm_queue_exception(vcpu, UD_VECTOR); 8112 + kvm_inject_gp(vcpu, 0); 8317 8113 return 1; 8318 8114 } 8319 8115 ··· 8376 8172 */ 8377 8173 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 8378 8174 { 8379 - if (vmx_get_cpl(vcpu)) { 8380 - kvm_queue_exception(vcpu, UD_VECTOR); 8381 - return 0; 8382 - } 8383 - 8384 8175 if (!to_vmx(vcpu)->nested.vmxon) { 8385 8176 kvm_queue_exception(vcpu, UD_VECTOR); 8386 8177 return 0; 8387 8178 } 8179 + 8180 + if (vmx_get_cpl(vcpu)) { 8181 + kvm_inject_gp(vcpu, 0); 8182 + return 0; 8183 + } 8184 + 8388 8185 return 1; 8389 8186 } 8390 8187 ··· 8438 8233 vmx->vmcs01.shadow_vmcs = NULL; 8439 8234 } 8440 8235 kfree(vmx->nested.cached_vmcs12); 8236 + kfree(vmx->nested.cached_shadow_vmcs12); 8441 8237 /* Unpin physical memory we referred to in the vmcs02 */ 8442 8238 if (vmx->nested.apic_access_page) { 8443 8239 kvm_release_page_dirty(vmx->nested.apic_access_page); ··· 8524 8318 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of 8525 8319 * 64-bit fields are to be returned). 
8526 8320 */ 8527 - static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, 8321 + static inline int vmcs12_read_any(struct vmcs12 *vmcs12, 8528 8322 unsigned long field, u64 *ret) 8529 8323 { 8530 8324 short offset = vmcs_field_to_offset(field); ··· 8533 8327 if (offset < 0) 8534 8328 return offset; 8535 8329 8536 - p = ((char *)(get_vmcs12(vcpu))) + offset; 8330 + p = (char *)vmcs12 + offset; 8537 8331 8538 8332 switch (vmcs_field_width(field)) { 8539 8333 case VMCS_FIELD_WIDTH_NATURAL_WIDTH: ··· 8555 8349 } 8556 8350 8557 8351 8558 - static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, 8352 + static inline int vmcs12_write_any(struct vmcs12 *vmcs12, 8559 8353 unsigned long field, u64 field_value){ 8560 8354 short offset = vmcs_field_to_offset(field); 8561 - char *p = ((char *) get_vmcs12(vcpu)) + offset; 8355 + char *p = (char *)vmcs12 + offset; 8562 8356 if (offset < 0) 8563 8357 return offset; 8564 8358 ··· 8611 8405 for (i = 0; i < max_fields[q]; i++) { 8612 8406 field = fields[q][i]; 8613 8407 field_value = __vmcs_readl(field); 8614 - vmcs12_write_any(&vmx->vcpu, field, field_value); 8408 + vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); 8615 8409 } 8616 8410 /* 8617 8411 * Skip the VM-exit information fields if they are read-only. 
··· 8646 8440 for (q = 0; q < ARRAY_SIZE(fields); q++) { 8647 8441 for (i = 0; i < max_fields[q]; i++) { 8648 8442 field = fields[q][i]; 8649 - vmcs12_read_any(&vmx->vcpu, field, &field_value); 8443 + vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); 8650 8444 __vmcs_writel(field, field_value); 8651 8445 } 8652 8446 } ··· 8676 8470 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 8677 8471 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 8678 8472 gva_t gva = 0; 8473 + struct vmcs12 *vmcs12; 8679 8474 8680 8475 if (!nested_vmx_check_permission(vcpu)) 8681 8476 return 1; ··· 8684 8477 if (!nested_vmx_check_vmcs12(vcpu)) 8685 8478 return kvm_skip_emulated_instruction(vcpu); 8686 8479 8480 + if (!is_guest_mode(vcpu)) 8481 + vmcs12 = get_vmcs12(vcpu); 8482 + else { 8483 + /* 8484 + * When vmcs->vmcs_link_pointer is -1ull, any VMREAD 8485 + * to shadowed-field sets the ALU flags for VMfailInvalid. 8486 + */ 8487 + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { 8488 + nested_vmx_failInvalid(vcpu); 8489 + return kvm_skip_emulated_instruction(vcpu); 8490 + } 8491 + vmcs12 = get_shadow_vmcs12(vcpu); 8492 + } 8493 + 8687 8494 /* Decode instruction info and find the field to read */ 8688 8495 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 8689 8496 /* Read the field, zero-extended to a u64 field_value */ 8690 - if (vmcs12_read_any(vcpu, field, &field_value) < 0) { 8497 + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) { 8691 8498 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 8692 8499 return kvm_skip_emulated_instruction(vcpu); 8693 8500 } ··· 8743 8522 */ 8744 8523 u64 field_value = 0; 8745 8524 struct x86_exception e; 8525 + struct vmcs12 *vmcs12; 8746 8526 8747 8527 if (!nested_vmx_check_permission(vcpu)) 8748 8528 return 1; ··· 8778 8556 return kvm_skip_emulated_instruction(vcpu); 8779 8557 } 8780 8558 8781 - if (vmcs12_write_any(vcpu, field, field_value) < 0) { 8559 + if 
(!is_guest_mode(vcpu)) 8560 + vmcs12 = get_vmcs12(vcpu); 8561 + else { 8562 + /* 8563 + * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE 8564 + * to shadowed-field sets the ALU flags for VMfailInvalid. 8565 + */ 8566 + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { 8567 + nested_vmx_failInvalid(vcpu); 8568 + return kvm_skip_emulated_instruction(vcpu); 8569 + } 8570 + vmcs12 = get_shadow_vmcs12(vcpu); 8571 + 8572 + } 8573 + 8574 + if (vmcs12_write_any(vmcs12, field, field_value) < 0) { 8782 8575 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 8783 8576 return kvm_skip_emulated_instruction(vcpu); 8784 8577 } 8785 8578 8786 - switch (field) { 8579 + /* 8580 + * Do not track vmcs12 dirty-state if in guest-mode 8581 + * as we actually dirty shadow vmcs12 instead of vmcs12. 8582 + */ 8583 + if (!is_guest_mode(vcpu)) { 8584 + switch (field) { 8787 8585 #define SHADOW_FIELD_RW(x) case x: 8788 8586 #include "vmx_shadow_fields.h" 8789 - /* 8790 - * The fields that can be updated by L1 without a vmexit are 8791 - * always updated in the vmcs02, the others go down the slow 8792 - * path of prepare_vmcs02. 8793 - */ 8794 - break; 8795 - default: 8796 - vmx->nested.dirty_vmcs12 = true; 8797 - break; 8587 + /* 8588 + * The fields that can be updated by L1 without a vmexit are 8589 + * always updated in the vmcs02, the others go down the slow 8590 + * path of prepare_vmcs02. 
8591 + */ 8592 + break; 8593 + default: 8594 + vmx->nested.dirty_vmcs12 = true; 8595 + break; 8596 + } 8798 8597 } 8799 8598 8800 8599 nested_vmx_succeed(vcpu); ··· 8866 8623 return kvm_skip_emulated_instruction(vcpu); 8867 8624 } 8868 8625 new_vmcs12 = kmap(page); 8869 - if (new_vmcs12->revision_id != VMCS12_REVISION) { 8626 + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 8627 + (new_vmcs12->hdr.shadow_vmcs && 8628 + !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 8870 8629 kunmap(page); 8871 8630 kvm_release_page_clean(page); 8872 8631 nested_vmx_failValid(vcpu, ··· 9064 8819 nested_vmx_succeed(vcpu); 9065 8820 9066 8821 return kvm_skip_emulated_instruction(vcpu); 8822 + } 8823 + 8824 + static int handle_invpcid(struct kvm_vcpu *vcpu) 8825 + { 8826 + u32 vmx_instruction_info; 8827 + unsigned long type; 8828 + bool pcid_enabled; 8829 + gva_t gva; 8830 + struct x86_exception e; 8831 + unsigned i; 8832 + unsigned long roots_to_free = 0; 8833 + struct { 8834 + u64 pcid; 8835 + u64 gla; 8836 + } operand; 8837 + 8838 + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { 8839 + kvm_queue_exception(vcpu, UD_VECTOR); 8840 + return 1; 8841 + } 8842 + 8843 + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 8844 + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 8845 + 8846 + if (type > 3) { 8847 + kvm_inject_gp(vcpu, 0); 8848 + return 1; 8849 + } 8850 + 8851 + /* According to the Intel instruction reference, the memory operand 8852 + * is read even if it isn't needed (e.g., for type==all) 8853 + */ 8854 + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 8855 + vmx_instruction_info, false, &gva)) 8856 + return 1; 8857 + 8858 + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 8859 + kvm_inject_page_fault(vcpu, &e); 8860 + return 1; 8861 + } 8862 + 8863 + if (operand.pcid >> 12 != 0) { 8864 + kvm_inject_gp(vcpu, 0); 8865 + return 1; 8866 + } 8867 + 8868 + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); 8869 
+ 8870 + switch (type) { 8871 + case INVPCID_TYPE_INDIV_ADDR: 8872 + if ((!pcid_enabled && (operand.pcid != 0)) || 8873 + is_noncanonical_address(operand.gla, vcpu)) { 8874 + kvm_inject_gp(vcpu, 0); 8875 + return 1; 8876 + } 8877 + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); 8878 + return kvm_skip_emulated_instruction(vcpu); 8879 + 8880 + case INVPCID_TYPE_SINGLE_CTXT: 8881 + if (!pcid_enabled && (operand.pcid != 0)) { 8882 + kvm_inject_gp(vcpu, 0); 8883 + return 1; 8884 + } 8885 + 8886 + if (kvm_get_active_pcid(vcpu) == operand.pcid) { 8887 + kvm_mmu_sync_roots(vcpu); 8888 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 8889 + } 8890 + 8891 + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 8892 + if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3) 8893 + == operand.pcid) 8894 + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 8895 + 8896 + kvm_mmu_free_roots(vcpu, roots_to_free); 8897 + /* 8898 + * If neither the current cr3 nor any of the prev_roots use the 8899 + * given PCID, then nothing needs to be done here because a 8900 + * resync will happen anyway before switching to any other CR3. 8901 + */ 8902 + 8903 + return kvm_skip_emulated_instruction(vcpu); 8904 + 8905 + case INVPCID_TYPE_ALL_NON_GLOBAL: 8906 + /* 8907 + * Currently, KVM doesn't mark global entries in the shadow 8908 + * page tables, so a non-global flush just degenerates to a 8909 + * global flush. If needed, we could optimize this later by 8910 + * keeping track of global entries in shadow page tables. 
8911 + */ 8912 + 8913 + /* fall-through */ 8914 + case INVPCID_TYPE_ALL_INCL_GLOBAL: 8915 + kvm_mmu_unload(vcpu); 8916 + return kvm_skip_emulated_instruction(vcpu); 8917 + 8918 + default: 8919 + BUG(); /* We have already checked above that type <= 3 */ 8920 + } 9067 8921 } 9068 8922 9069 8923 static int handle_pml_full(struct kvm_vcpu *vcpu) ··· 9368 9024 [EXIT_REASON_XSAVES] = handle_xsaves, 9369 9025 [EXIT_REASON_XRSTORS] = handle_xrstors, 9370 9026 [EXIT_REASON_PML_FULL] = handle_pml_full, 9027 + [EXIT_REASON_INVPCID] = handle_invpcid, 9371 9028 [EXIT_REASON_VMFUNC] = handle_vmfunc, 9372 9029 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 9373 9030 }; ··· 9541 9196 return false; 9542 9197 } 9543 9198 9199 + static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 9200 + struct vmcs12 *vmcs12, gpa_t bitmap) 9201 + { 9202 + u32 vmx_instruction_info; 9203 + unsigned long field; 9204 + u8 b; 9205 + 9206 + if (!nested_cpu_has_shadow_vmcs(vmcs12)) 9207 + return true; 9208 + 9209 + /* Decode instruction info and find the field to access */ 9210 + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 9211 + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 9212 + 9213 + /* Out-of-range fields always cause a VM exit from L2 to L1 */ 9214 + if (field >> 15) 9215 + return true; 9216 + 9217 + if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 9218 + return true; 9219 + 9220 + return 1 & (b >> (field & 7)); 9221 + } 9222 + 9544 9223 /* 9545 9224 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we 9546 9225 * should handle it ourselves in L0 (and then continue L2). 
Only call this ··· 9649 9280 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 9650 9281 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 9651 9282 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 9283 + case EXIT_REASON_VMREAD: 9284 + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 9285 + vmcs12->vmread_bitmap); 9286 + case EXIT_REASON_VMWRITE: 9287 + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 9288 + vmcs12->vmwrite_bitmap); 9652 9289 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 9653 9290 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 9654 - case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 9655 - case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 9291 + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 9656 9292 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 9657 9293 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 9658 9294 /* ··· 10618 10244 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 10619 10245 10620 10246 cr3 = __get_current_cr3_fast(); 10621 - if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { 10247 + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 10622 10248 vmcs_writel(HOST_CR3, cr3); 10623 - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; 10249 + vmx->loaded_vmcs->host_state.cr3 = cr3; 10624 10250 } 10625 10251 10626 10252 cr4 = cr4_read_shadow(); 10627 - if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { 10253 + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 10628 10254 vmcs_writel(HOST_CR4, cr4); 10629 - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; 10255 + vmx->loaded_vmcs->host_state.cr4 = cr4; 10630 10256 } 10631 10257 10632 10258 /* When single-stepping over STI and MOV SS, we must clear the ··· 10822 10448 * The sysexit path does not restore ds/es, so we must set them to 10823 10449 * a reasonable value ourselves. 
10824 10450 * 10825 - * We can't defer this to vmx_load_host_state() since that function 10826 - * may be executed in interrupt context, which saves and restore segments 10827 - * around it, nullifying its effect. 10451 + * We can't defer this to vmx_prepare_switch_to_host() since that 10452 + * function may be executed in interrupt context, which saves and 10453 + * restore segments around it, nullifying its effect. 10828 10454 */ 10829 10455 loadsegment(ds, __USER_DS); 10830 10456 loadsegment(es, __USER_DS); ··· 10885 10511 return; 10886 10512 10887 10513 cpu = get_cpu(); 10888 - vmx->loaded_vmcs = vmcs; 10889 10514 vmx_vcpu_put(vcpu); 10515 + vmx->loaded_vmcs = vmcs; 10890 10516 vmx_vcpu_load(vcpu, cpu); 10891 10517 put_cpu(); 10892 10518 } ··· 11026 10652 11027 10653 static int vmx_vm_init(struct kvm *kvm) 11028 10654 { 10655 + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); 10656 + 11029 10657 if (!ple_gap) 11030 10658 kvm->arch.pause_in_guest = true; 11031 10659 ··· 11252 10876 if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu))) 11253 10877 return 1; 11254 10878 11255 - kvm_mmu_unload(vcpu); 11256 10879 kvm_init_shadow_ept_mmu(vcpu, 11257 10880 to_vmx(vcpu)->nested.msrs.ept_caps & 11258 10881 VMX_EPT_EXECUTE_ONLY_BIT, 11259 - nested_ept_ad_enabled(vcpu)); 10882 + nested_ept_ad_enabled(vcpu), 10883 + nested_ept_get_cr3(vcpu)); 11260 10884 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 11261 10885 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; 11262 10886 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; ··· 11304 10928 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 11305 10929 struct vmcs12 *vmcs12); 11306 10930 11307 - static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, 11308 - struct vmcs12 *vmcs12) 10931 + static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 11309 10932 { 10933 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 11310 10934 struct vcpu_vmx *vmx = to_vmx(vcpu); 11311 10935 struct page *page; 11312 
10936 u64 hpa; ··· 11547 11171 return true; 11548 11172 } 11549 11173 11174 + static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 11175 + struct vmcs12 *vmcs12) 11176 + { 11177 + struct vmcs12 *shadow; 11178 + struct page *page; 11179 + 11180 + if (!nested_cpu_has_shadow_vmcs(vmcs12) || 11181 + vmcs12->vmcs_link_pointer == -1ull) 11182 + return; 11183 + 11184 + shadow = get_shadow_vmcs12(vcpu); 11185 + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); 11186 + 11187 + memcpy(shadow, kmap(page), VMCS12_SIZE); 11188 + 11189 + kunmap(page); 11190 + kvm_release_page_clean(page); 11191 + } 11192 + 11193 + static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 11194 + struct vmcs12 *vmcs12) 11195 + { 11196 + struct vcpu_vmx *vmx = to_vmx(vcpu); 11197 + 11198 + if (!nested_cpu_has_shadow_vmcs(vmcs12) || 11199 + vmcs12->vmcs_link_pointer == -1ull) 11200 + return; 11201 + 11202 + kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, 11203 + get_shadow_vmcs12(vcpu), VMCS12_SIZE); 11204 + } 11205 + 11550 11206 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 11551 11207 struct vmcs12 *vmcs12) 11552 11208 { ··· 11636 11228 unsigned long count_field, 11637 11229 unsigned long addr_field) 11638 11230 { 11231 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 11639 11232 int maxphyaddr; 11640 11233 u64 count, addr; 11641 11234 11642 - if (vmcs12_read_any(vcpu, count_field, &count) || 11643 - vmcs12_read_any(vcpu, addr_field, &addr)) { 11235 + if (vmcs12_read_any(vmcs12, count_field, &count) || 11236 + vmcs12_read_any(vmcs12, addr_field, &addr)) { 11644 11237 WARN_ON(1); 11645 11238 return -EINVAL; 11646 11239 } ··· 11687 11278 address >> maxphyaddr) 11688 11279 return -EINVAL; 11689 11280 } 11281 + 11282 + return 0; 11283 + } 11284 + 11285 + static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 11286 + struct vmcs12 *vmcs12) 11287 + { 11288 + if (!nested_cpu_has_shadow_vmcs(vmcs12)) 11289 + return 0; 11290 + 
11291 + if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || 11292 + !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) 11293 + return -EINVAL; 11690 11294 11691 11295 return 0; 11692 11296 } ··· 11853 11431 return 1; 11854 11432 } 11855 11433 } 11856 - 11857 - vcpu->arch.cr3 = cr3; 11858 - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 11859 11434 } 11860 11435 11861 - kvm_mmu_reset_context(vcpu); 11436 + if (!nested_ept) 11437 + kvm_mmu_new_cr3(vcpu, cr3, false); 11438 + 11439 + vcpu->arch.cr3 = cr3; 11440 + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 11441 + 11442 + kvm_init_mmu(vcpu, false); 11443 + 11862 11444 return 0; 11863 11445 } 11864 11446 ··· 11949 11523 * Set host-state according to L0's settings (vmcs12 is irrelevant here) 11950 11524 * Some constant fields are set here by vmx_set_constant_host_state(). 11951 11525 * Other fields are different per CPU, and will be set later when 11952 - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. 11526 + * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest() 11527 + * is called. 11953 11528 */ 11954 11529 vmx_set_constant_host_state(vmx); 11955 11530 ··· 12022 11595 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 12023 11596 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 12024 11597 12025 - /* 12026 - * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR, 12027 - * HOST_FS_BASE, HOST_GS_BASE. 
12028 - */ 12029 - 12030 11598 if (vmx->nested.nested_run_pending && 12031 11599 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 12032 11600 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); ··· 12085 11663 ~SECONDARY_EXEC_ENABLE_PML; 12086 11664 exec_control |= vmcs12_exec_ctrl; 12087 11665 } 11666 + 11667 + /* VMCS shadowing for L2 is emulated for now */ 11668 + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 12088 11669 12089 11670 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 12090 11671 vmcs_write16(GUEST_INTR_STATUS, ··· 12308 11883 if (nested_vmx_check_pml_controls(vcpu, vmcs12)) 12309 11884 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 12310 11885 11886 + if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) 11887 + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 11888 + 12311 11889 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 12312 11890 vmx->nested.msrs.procbased_ctls_low, 12313 11891 vmx->nested.msrs.procbased_ctls_high) || ··· 12411 11983 return 0; 12412 11984 } 12413 11985 11986 + static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 11987 + struct vmcs12 *vmcs12) 11988 + { 11989 + int r; 11990 + struct page *page; 11991 + struct vmcs12 *shadow; 11992 + 11993 + if (vmcs12->vmcs_link_pointer == -1ull) 11994 + return 0; 11995 + 11996 + if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) 11997 + return -EINVAL; 11998 + 11999 + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); 12000 + if (is_error_page(page)) 12001 + return -EINVAL; 12002 + 12003 + r = 0; 12004 + shadow = kmap(page); 12005 + if (shadow->hdr.revision_id != VMCS12_REVISION || 12006 + shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) 12007 + r = -EINVAL; 12008 + kunmap(page); 12009 + kvm_release_page_clean(page); 12010 + return r; 12011 + } 12012 + 12414 12013 static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 12415 12014 u32 *exit_qual) 12416 12015 { ··· 12449 11994 !nested_guest_cr4_valid(vcpu, 
vmcs12->guest_cr4)) 12450 11995 return 1; 12451 11996 12452 - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) && 12453 - vmcs12->vmcs_link_pointer != -1ull) { 11997 + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 12454 11998 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; 12455 11999 return 1; 12456 12000 } ··· 12496 12042 return 0; 12497 12043 } 12498 12044 12499 - static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) 12045 + /* 12046 + * If exit_qual is NULL, this is being called from state restore (either RSM 12047 + * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 12048 + */ 12049 + static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) 12500 12050 { 12501 12051 struct vcpu_vmx *vmx = to_vmx(vcpu); 12502 12052 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 12503 - u32 exit_qual; 12504 - int r; 12053 + bool from_vmentry = !!exit_qual; 12054 + u32 dummy_exit_qual; 12055 + int r = 0; 12505 12056 12506 12057 enter_guest_mode(vcpu); 12507 12058 ··· 12520 12061 vcpu->arch.tsc_offset += vmcs12->tsc_offset; 12521 12062 12522 12063 r = EXIT_REASON_INVALID_STATE; 12523 - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) 12064 + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? 
exit_qual : &dummy_exit_qual)) 12524 12065 goto fail; 12525 12066 12526 - nested_get_vmcs12_pages(vcpu, vmcs12); 12067 + if (from_vmentry) { 12068 + nested_get_vmcs12_pages(vcpu); 12527 12069 12528 - r = EXIT_REASON_MSR_LOAD_FAIL; 12529 - exit_qual = nested_vmx_load_msr(vcpu, 12530 - vmcs12->vm_entry_msr_load_addr, 12531 - vmcs12->vm_entry_msr_load_count); 12532 - if (exit_qual) 12533 - goto fail; 12070 + r = EXIT_REASON_MSR_LOAD_FAIL; 12071 + *exit_qual = nested_vmx_load_msr(vcpu, 12072 + vmcs12->vm_entry_msr_load_addr, 12073 + vmcs12->vm_entry_msr_load_count); 12074 + if (*exit_qual) 12075 + goto fail; 12076 + } else { 12077 + /* 12078 + * The MMU is not initialized to point at the right entities yet and 12079 + * "get pages" would need to read data from the guest (i.e. we will 12080 + * need to perform gpa to hpa translation). Request a call 12081 + * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 12082 + * have already been set at vmentry time and should not be reset. 12083 + */ 12084 + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); 12085 + } 12534 12086 12535 12087 /* 12536 12088 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point ··· 12556 12086 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 12557 12087 leave_guest_mode(vcpu); 12558 12088 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 12559 - nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual); 12560 - return 1; 12089 + return r; 12561 12090 } 12562 12091 12563 12092 /* ··· 12578 12109 goto out; 12579 12110 12580 12111 vmcs12 = get_vmcs12(vcpu); 12112 + 12113 + /* 12114 + * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 12115 + * that there *is* a valid VMCS pointer, RFLAGS.CF is set 12116 + * rather than RFLAGS.ZF, and no error number is stored to the 12117 + * VM-instruction error field. 
12118 + */ 12119 + if (vmcs12->hdr.shadow_vmcs) { 12120 + nested_vmx_failInvalid(vcpu); 12121 + goto out; 12122 + } 12581 12123 12582 12124 if (enable_shadow_vmcs) 12583 12125 copy_shadow_to_vmcs12(vmx); ··· 12644 12164 */ 12645 12165 12646 12166 vmx->nested.nested_run_pending = 1; 12647 - ret = enter_vmx_non_root_mode(vcpu); 12167 + ret = enter_vmx_non_root_mode(vcpu, &exit_qual); 12648 12168 if (ret) { 12169 + nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual); 12649 12170 vmx->nested.nested_run_pending = 0; 12650 - return ret; 12171 + return 1; 12651 12172 } 12652 12173 12653 12174 /* Hide L1D cache contents from the nested guest. */ 12654 12175 vmx->vcpu.arch.l1tf_flush_l1d = true; 12176 + 12177 + /* 12178 + * Must happen outside of enter_vmx_non_root_mode() as it will 12179 + * also be used as part of restoring nVMX state for 12180 + * snapshot restore (migration). 12181 + * 12182 + * In this flow, it is assumed that vmcs12 cache was 12183 + * trasferred as part of captured nVMX state and should 12184 + * therefore not be read from guest memory (which may not 12185 + * exist on destination host yet). 12186 + */ 12187 + nested_cache_shadow_vmcs12(vcpu, vmcs12); 12655 12188 12656 12189 /* 12657 12190 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken ··· 13174 12681 else 13175 12682 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 13176 12683 exit_qualification); 12684 + 12685 + /* 12686 + * Must happen outside of sync_vmcs12() as it will 12687 + * also be used to capture vmcs12 cache as part of 12688 + * capturing nVMX state for snapshot (migration). 12689 + * 12690 + * Otherwise, this flush will dirty guest memory at a 12691 + * point it is already assumed by user-space to be 12692 + * immutable. 
12693 + */ 12694 + nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 13177 12695 13178 12696 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, 13179 12697 vmcs12->vm_exit_msr_store_count)) ··· 13760 13256 13761 13257 if (vmx->nested.smm.guest_mode) { 13762 13258 vcpu->arch.hflags &= ~HF_SMM_MASK; 13763 - ret = enter_vmx_non_root_mode(vcpu); 13259 + ret = enter_vmx_non_root_mode(vcpu, NULL); 13764 13260 vcpu->arch.hflags |= HF_SMM_MASK; 13765 13261 if (ret) 13766 13262 return ret; ··· 13772 13268 13773 13269 static int enable_smi_window(struct kvm_vcpu *vcpu) 13774 13270 { 13271 + return 0; 13272 + } 13273 + 13274 + static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 13275 + struct kvm_nested_state __user *user_kvm_nested_state, 13276 + u32 user_data_size) 13277 + { 13278 + struct vcpu_vmx *vmx; 13279 + struct vmcs12 *vmcs12; 13280 + struct kvm_nested_state kvm_state = { 13281 + .flags = 0, 13282 + .format = 0, 13283 + .size = sizeof(kvm_state), 13284 + .vmx.vmxon_pa = -1ull, 13285 + .vmx.vmcs_pa = -1ull, 13286 + }; 13287 + 13288 + if (!vcpu) 13289 + return kvm_state.size + 2 * VMCS12_SIZE; 13290 + 13291 + vmx = to_vmx(vcpu); 13292 + vmcs12 = get_vmcs12(vcpu); 13293 + if (nested_vmx_allowed(vcpu) && 13294 + (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 13295 + kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 13296 + kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; 13297 + 13298 + if (vmx->nested.current_vmptr != -1ull) { 13299 + kvm_state.size += VMCS12_SIZE; 13300 + 13301 + if (is_guest_mode(vcpu) && 13302 + nested_cpu_has_shadow_vmcs(vmcs12) && 13303 + vmcs12->vmcs_link_pointer != -1ull) 13304 + kvm_state.size += VMCS12_SIZE; 13305 + } 13306 + 13307 + if (vmx->nested.smm.vmxon) 13308 + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 13309 + 13310 + if (vmx->nested.smm.guest_mode) 13311 + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 13312 + 13313 + if (is_guest_mode(vcpu)) { 13314 + kvm_state.flags |= 
KVM_STATE_NESTED_GUEST_MODE; 13315 + 13316 + if (vmx->nested.nested_run_pending) 13317 + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 13318 + } 13319 + } 13320 + 13321 + if (user_data_size < kvm_state.size) 13322 + goto out; 13323 + 13324 + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 13325 + return -EFAULT; 13326 + 13327 + if (vmx->nested.current_vmptr == -1ull) 13328 + goto out; 13329 + 13330 + /* 13331 + * When running L2, the authoritative vmcs12 state is in the 13332 + * vmcs02. When running L1, the authoritative vmcs12 state is 13333 + * in the shadow vmcs linked to vmcs01, unless 13334 + * sync_shadow_vmcs is set, in which case, the authoritative 13335 + * vmcs12 state is in the vmcs12 already. 13336 + */ 13337 + if (is_guest_mode(vcpu)) 13338 + sync_vmcs12(vcpu, vmcs12); 13339 + else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs) 13340 + copy_shadow_to_vmcs12(vmx); 13341 + 13342 + if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) 13343 + return -EFAULT; 13344 + 13345 + if (nested_cpu_has_shadow_vmcs(vmcs12) && 13346 + vmcs12->vmcs_link_pointer != -1ull) { 13347 + if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, 13348 + get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) 13349 + return -EFAULT; 13350 + } 13351 + 13352 + out: 13353 + return kvm_state.size; 13354 + } 13355 + 13356 + static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 13357 + struct kvm_nested_state __user *user_kvm_nested_state, 13358 + struct kvm_nested_state *kvm_state) 13359 + { 13360 + struct vcpu_vmx *vmx = to_vmx(vcpu); 13361 + struct vmcs12 *vmcs12; 13362 + u32 exit_qual; 13363 + int ret; 13364 + 13365 + if (kvm_state->format != 0) 13366 + return -EINVAL; 13367 + 13368 + if (!nested_vmx_allowed(vcpu)) 13369 + return kvm_state->vmx.vmxon_pa == -1ull ? 
0 : -EINVAL; 13370 + 13371 + if (kvm_state->vmx.vmxon_pa == -1ull) { 13372 + if (kvm_state->vmx.smm.flags) 13373 + return -EINVAL; 13374 + 13375 + if (kvm_state->vmx.vmcs_pa != -1ull) 13376 + return -EINVAL; 13377 + 13378 + vmx_leave_nested(vcpu); 13379 + return 0; 13380 + } 13381 + 13382 + if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) 13383 + return -EINVAL; 13384 + 13385 + if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) 13386 + return -EINVAL; 13387 + 13388 + if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || 13389 + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) 13390 + return -EINVAL; 13391 + 13392 + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 13393 + (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 13394 + return -EINVAL; 13395 + 13396 + if (kvm_state->vmx.smm.flags & 13397 + ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 13398 + return -EINVAL; 13399 + 13400 + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 13401 + !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 13402 + return -EINVAL; 13403 + 13404 + vmx_leave_nested(vcpu); 13405 + if (kvm_state->vmx.vmxon_pa == -1ull) 13406 + return 0; 13407 + 13408 + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; 13409 + ret = enter_vmx_operation(vcpu); 13410 + if (ret) 13411 + return ret; 13412 + 13413 + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); 13414 + 13415 + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 13416 + vmx->nested.smm.vmxon = true; 13417 + vmx->nested.vmxon = false; 13418 + 13419 + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 13420 + vmx->nested.smm.guest_mode = true; 13421 + } 13422 + 13423 + vmcs12 = get_vmcs12(vcpu); 13424 + if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) 13425 + return -EFAULT; 13426 + 13427 + if (vmcs12->hdr.revision_id != VMCS12_REVISION) 13428 + return -EINVAL; 13429 + 13430 + if (!(kvm_state->flags & 
KVM_STATE_NESTED_GUEST_MODE)) 13431 + return 0; 13432 + 13433 + vmx->nested.nested_run_pending = 13434 + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 13435 + 13436 + if (nested_cpu_has_shadow_vmcs(vmcs12) && 13437 + vmcs12->vmcs_link_pointer != -1ull) { 13438 + struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 13439 + if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) 13440 + return -EINVAL; 13441 + 13442 + if (copy_from_user(shadow_vmcs12, 13443 + user_kvm_nested_state->data + VMCS12_SIZE, 13444 + sizeof(*vmcs12))) 13445 + return -EFAULT; 13446 + 13447 + if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 13448 + !shadow_vmcs12->hdr.shadow_vmcs) 13449 + return -EINVAL; 13450 + } 13451 + 13452 + if (check_vmentry_prereqs(vcpu, vmcs12) || 13453 + check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) 13454 + return -EINVAL; 13455 + 13456 + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) 13457 + vmx->nested.nested_run_pending = 1; 13458 + 13459 + vmx->nested.dirty_vmcs12 = true; 13460 + ret = enter_vmx_non_root_mode(vcpu, NULL); 13461 + if (ret) 13462 + return -EINVAL; 13463 + 13775 13464 return 0; 13776 13465 } 13777 13466 ··· 13987 13290 .vcpu_free = vmx_free_vcpu, 13988 13291 .vcpu_reset = vmx_vcpu_reset, 13989 13292 13990 - .prepare_guest_switch = vmx_save_host_state, 13293 + .prepare_guest_switch = vmx_prepare_switch_to_guest, 13991 13294 .vcpu_load = vmx_vcpu_load, 13992 13295 .vcpu_put = vmx_vcpu_put, 13993 13296 ··· 14020 13323 .set_rflags = vmx_set_rflags, 14021 13324 14022 13325 .tlb_flush = vmx_flush_tlb, 13326 + .tlb_flush_gva = vmx_flush_tlb_gva, 14023 13327 14024 13328 .run = vmx_vcpu_run, 14025 13329 .handle_exit = vmx_handle_exit, ··· 14102 13404 #endif 14103 13405 14104 13406 .setup_mce = vmx_setup_mce, 13407 + 13408 + .get_nested_state = vmx_get_nested_state, 13409 + .set_nested_state = vmx_set_nested_state, 13410 + .get_vmcs12_pages = nested_get_vmcs12_pages, 14105 13411 14106 13412 .smi_allowed = vmx_smi_allowed, 
14107 13413 .pre_enter_smm = vmx_pre_enter_smm,
+91 -17
arch/x86/kvm/x86.c
··· 848 848 849 849 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 850 850 { 851 + bool skip_tlb_flush = false; 851 852 #ifdef CONFIG_X86_64 852 853 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); 853 854 854 - if (pcid_enabled) 855 - cr3 &= ~CR3_PCID_INVD; 855 + if (pcid_enabled) { 856 + skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; 857 + cr3 &= ~X86_CR3_PCID_NOFLUSH; 858 + } 856 859 #endif 857 860 858 861 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { 859 - kvm_mmu_sync_roots(vcpu); 860 - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 862 + if (!skip_tlb_flush) { 863 + kvm_mmu_sync_roots(vcpu); 864 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 865 + } 861 866 return 0; 862 867 } 863 868 ··· 873 868 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) 874 869 return 1; 875 870 871 + kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); 876 872 vcpu->arch.cr3 = cr3; 877 873 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 878 - kvm_mmu_new_cr3(vcpu); 874 + 879 875 return 0; 880 876 } 881 877 EXPORT_SYMBOL_GPL(kvm_set_cr3); ··· 2191 2185 vcpu->arch.mcg_status = data; 2192 2186 break; 2193 2187 case MSR_IA32_MCG_CTL: 2194 - if (!(mcg_cap & MCG_CTL_P)) 2188 + if (!(mcg_cap & MCG_CTL_P) && 2189 + (data || !msr_info->host_initiated)) 2195 2190 return 1; 2196 2191 if (data != 0 && data != ~(u64)0) 2197 - return -1; 2192 + return 1; 2198 2193 vcpu->arch.mcg_ctl = data; 2199 2194 break; 2200 2195 default: ··· 2583 2576 } 2584 2577 EXPORT_SYMBOL_GPL(kvm_get_msr); 2585 2578 2586 - static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 2579 + static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) 2587 2580 { 2588 2581 u64 data; 2589 2582 u64 mcg_cap = vcpu->arch.mcg_cap; ··· 2598 2591 data = vcpu->arch.mcg_cap; 2599 2592 break; 2600 2593 case MSR_IA32_MCG_CTL: 2601 - if (!(mcg_cap & MCG_CTL_P)) 2594 + if (!(mcg_cap & MCG_CTL_P) && !host) 2602 2595 return 1; 2603 2596 data = vcpu->arch.mcg_ctl; 2604 2597 break; ··· 2731 
2724 case MSR_IA32_MCG_CTL: 2732 2725 case MSR_IA32_MCG_STATUS: 2733 2726 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 2734 - return get_msr_mce(vcpu, msr_info->index, &msr_info->data); 2727 + return get_msr_mce(vcpu, msr_info->index, &msr_info->data, 2728 + msr_info->host_initiated); 2735 2729 case MSR_K7_CLK_CTL: 2736 2730 /* 2737 2731 * Provide expected ramp-up count for K7. All other ··· 2753 2745 case HV_X64_MSR_TSC_EMULATION_CONTROL: 2754 2746 case HV_X64_MSR_TSC_EMULATION_STATUS: 2755 2747 return kvm_hv_get_msr_common(vcpu, 2756 - msr_info->index, &msr_info->data); 2748 + msr_info->index, &msr_info->data, 2749 + msr_info->host_initiated); 2757 2750 break; 2758 2751 case MSR_IA32_BBL_CR_CTL3: 2759 2752 /* This legacy MSR exists but isn't fully documented in current ··· 2977 2968 break; 2978 2969 case KVM_CAP_X2APIC_API: 2979 2970 r = KVM_X2APIC_API_VALID_FLAGS; 2971 + break; 2972 + case KVM_CAP_NESTED_STATE: 2973 + r = kvm_x86_ops->get_nested_state ? 2974 + kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0; 2980 2975 break; 2981 2976 default: 2982 2977 break; ··· 3996 3983 if (copy_from_user(&cap, argp, sizeof(cap))) 3997 3984 goto out; 3998 3985 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); 3986 + break; 3987 + } 3988 + case KVM_GET_NESTED_STATE: { 3989 + struct kvm_nested_state __user *user_kvm_nested_state = argp; 3990 + u32 user_data_size; 3991 + 3992 + r = -EINVAL; 3993 + if (!kvm_x86_ops->get_nested_state) 3994 + break; 3995 + 3996 + BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); 3997 + if (get_user(user_data_size, &user_kvm_nested_state->size)) 3998 + return -EFAULT; 3999 + 4000 + r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, 4001 + user_data_size); 4002 + if (r < 0) 4003 + return r; 4004 + 4005 + if (r > user_data_size) { 4006 + if (put_user(r, &user_kvm_nested_state->size)) 4007 + return -EFAULT; 4008 + return -E2BIG; 4009 + } 4010 + r = 0; 4011 + break; 4012 + } 4013 + case 
KVM_SET_NESTED_STATE: { 4014 + struct kvm_nested_state __user *user_kvm_nested_state = argp; 4015 + struct kvm_nested_state kvm_state; 4016 + 4017 + r = -EINVAL; 4018 + if (!kvm_x86_ops->set_nested_state) 4019 + break; 4020 + 4021 + if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) 4022 + return -EFAULT; 4023 + 4024 + if (kvm_state.size < sizeof(kvm_state)) 4025 + return -EINVAL; 4026 + 4027 + if (kvm_state.flags & 4028 + ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) 4029 + return -EINVAL; 4030 + 4031 + /* nested_run_pending implies guest_mode. */ 4032 + if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) 4033 + return -EINVAL; 4034 + 4035 + r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); 3999 4036 break; 4000 4037 } 4001 4038 default: ··· 6566 6503 * Set the reserved bits and the present bit of an paging-structure 6567 6504 * entry to generate page fault with PFER.RSV = 1. 6568 6505 */ 6569 - /* Mask the reserved physical address bits. */ 6570 - mask = rsvd_bits(maxphyaddr, 51); 6506 + 6507 + /* 6508 + * Mask the uppermost physical address bit, which would be reserved as 6509 + * long as the supported physical address width is less than 52. 6510 + */ 6511 + mask = 1ull << 51; 6571 6512 6572 6513 /* Set the present bit. 
*/ 6573 6514 mask |= 1ull; ··· 6835 6768 #ifdef CONFIG_X86_64 6836 6769 case KVM_HC_CLOCK_PAIRING: 6837 6770 ret = kvm_pv_clock_pairing(vcpu, a0, a1); 6771 + break; 6772 + case KVM_HC_SEND_IPI: 6773 + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); 6838 6774 break; 6839 6775 #endif 6840 6776 default: ··· 7357 7287 bool req_immediate_exit = false; 7358 7288 7359 7289 if (kvm_request_pending(vcpu)) { 7290 + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) 7291 + kvm_x86_ops->get_vmcs12_pages(vcpu); 7360 7292 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) 7361 7293 kvm_mmu_unload(vcpu); 7362 7294 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) ··· 7374 7302 } 7375 7303 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 7376 7304 kvm_mmu_sync_roots(vcpu); 7305 + if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu)) 7306 + kvm_mmu_load_cr3(vcpu); 7377 7307 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 7378 7308 kvm_vcpu_flush_tlb(vcpu, true); 7379 7309 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { ··· 8087 8013 8088 8014 static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 8089 8015 { 8016 + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 8017 + (sregs->cr4 & X86_CR4_OSXSAVE)) 8018 + return -EINVAL; 8019 + 8090 8020 if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { 8091 8021 /* 8092 8022 * When EFER.LME and CR0.PG are set, the processor is in ··· 8120 8042 int pending_vec, max_bits, idx; 8121 8043 struct desc_ptr dt; 8122 8044 int ret = -EINVAL; 8123 - 8124 - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 8125 - (sregs->cr4 & X86_CR4_OSXSAVE)) 8126 - goto out; 8127 8045 8128 8046 if (kvm_valid_sregs(vcpu, sregs)) 8129 8047 goto out;
+19 -5
include/linux/kvm_host.h
··· 130 130 #define KVM_REQUEST_ARCH_BASE 8 131 131 132 132 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ 133 - BUILD_BUG_ON((unsigned)(nr) >= 32 - KVM_REQUEST_ARCH_BASE); \ 133 + BUILD_BUG_ON((unsigned)(nr) >= (FIELD_SIZEOF(struct kvm_vcpu, requests) * 8) - KVM_REQUEST_ARCH_BASE); \ 134 134 (unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \ 135 135 }) 136 136 #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) ··· 224 224 int vcpu_id; 225 225 int srcu_idx; 226 226 int mode; 227 - unsigned long requests; 227 + u64 requests; 228 228 unsigned long guest_debug; 229 229 230 230 int pre_pcpu; ··· 307 307 static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) 308 308 { 309 309 return ALIGN(memslot->npages, BITS_PER_LONG) / 8; 310 + } 311 + 312 + static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *memslot) 313 + { 314 + unsigned long len = kvm_dirty_bitmap_bytes(memslot); 315 + 316 + return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap); 310 317 } 311 318 312 319 struct kvm_s390_adapter_int { ··· 834 827 } 835 828 #endif 836 829 830 + #ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB 831 + static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) 832 + { 833 + return -ENOTSUPP; 834 + } 835 + #endif 836 + 837 837 #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA 838 838 void kvm_arch_register_noncoherent_dma(struct kvm *kvm); 839 839 void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm); ··· 1138 1124 * caller. Paired with the smp_mb__after_atomic in kvm_check_request. 
1139 1125 */ 1140 1126 smp_wmb(); 1141 - set_bit(req & KVM_REQUEST_MASK, &vcpu->requests); 1127 + set_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); 1142 1128 } 1143 1129 1144 1130 static inline bool kvm_request_pending(struct kvm_vcpu *vcpu) ··· 1148 1134 1149 1135 static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) 1150 1136 { 1151 - return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests); 1137 + return test_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); 1152 1138 } 1153 1139 1154 1140 static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) 1155 1141 { 1156 - clear_bit(req & KVM_REQUEST_MASK, &vcpu->requests); 1142 + clear_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests); 1157 1143 } 1158 1144 1159 1145 static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
+4
include/uapi/linux/kvm.h
··· 950 950 #define KVM_CAP_HYPERV_EVENTFD 154 951 951 #define KVM_CAP_HYPERV_TLBFLUSH 155 952 952 #define KVM_CAP_S390_HPAGE_1M 156 953 + #define KVM_CAP_NESTED_STATE 157 953 954 954 955 #ifdef KVM_CAP_IRQ_ROUTING 955 956 ··· 1393 1392 /* Available with KVM_CAP_HYPERV_EVENTFD */ 1394 1393 #define KVM_HYPERV_EVENTFD _IOW(KVMIO, 0xbd, struct kvm_hyperv_eventfd) 1395 1394 1395 + /* Available with KVM_CAP_NESTED_STATE */ 1396 + #define KVM_GET_NESTED_STATE _IOWR(KVMIO, 0xbe, struct kvm_nested_state) 1397 + #define KVM_SET_NESTED_STATE _IOW(KVMIO, 0xbf, struct kvm_nested_state) 1396 1398 1397 1399 /* Secure Encrypted Virtualization command */ 1398 1400 enum sev_cmd_id {
+2
include/uapi/linux/kvm_para.h
··· 13 13 /* Return values for hypercalls */ 14 14 #define KVM_ENOSYS 1000 15 15 #define KVM_EFAULT EFAULT 16 + #define KVM_EINVAL EINVAL 16 17 #define KVM_E2BIG E2BIG 17 18 #define KVM_EPERM EPERM 18 19 #define KVM_EOPNOTSUPP 95 ··· 27 26 #define KVM_HC_MIPS_EXIT_VM 7 28 27 #define KVM_HC_MIPS_CONSOLE_OUTPUT 8 29 28 #define KVM_HC_CLOCK_PAIRING 9 29 + #define KVM_HC_SEND_IPI 10 30 30 31 31 /* 32 32 * hypercalls use architecture specific
+2
tools/testing/selftests/kvm/.gitignore
··· 1 + cr4_cpuid_sync_test 1 2 set_sregs_test 2 3 sync_regs_test 3 4 vmx_tsc_adjust_test 5 + state_test
+2
tools/testing/selftests/kvm/Makefile
··· 9 9 TEST_GEN_PROGS_x86_64 = set_sregs_test 10 10 TEST_GEN_PROGS_x86_64 += sync_regs_test 11 11 TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test 12 + TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test 13 + TEST_GEN_PROGS_x86_64 += state_test 12 14 13 15 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) 14 16 LIBKVM += $(LIBKVM_$(UNAME_M))
+129
tools/testing/selftests/kvm/cr4_cpuid_sync_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * CR4 and CPUID sync test 4 + * 5 + * Copyright 2018, Red Hat, Inc. and/or its affiliates. 6 + * 7 + * Author: 8 + * Wei Huang <wei@redhat.com> 9 + */ 10 + 11 + #include <fcntl.h> 12 + #include <stdio.h> 13 + #include <stdlib.h> 14 + #include <string.h> 15 + #include <sys/ioctl.h> 16 + 17 + #include "test_util.h" 18 + 19 + #include "kvm_util.h" 20 + #include "x86.h" 21 + 22 + #define X86_FEATURE_XSAVE (1<<26) 23 + #define X86_FEATURE_OSXSAVE (1<<27) 24 + #define VCPU_ID 1 25 + 26 + enum { 27 + GUEST_UPDATE_CR4 = 0x1000, 28 + GUEST_FAILED, 29 + GUEST_DONE, 30 + }; 31 + 32 + static void exit_to_hv(uint16_t port) 33 + { 34 + __asm__ __volatile__("in %[port], %%al" 35 + : 36 + : [port]"d"(port) 37 + : "rax"); 38 + } 39 + 40 + static inline bool cr4_cpuid_is_sync(void) 41 + { 42 + int func, subfunc; 43 + uint32_t eax, ebx, ecx, edx; 44 + uint64_t cr4; 45 + 46 + func = 0x1; 47 + subfunc = 0x0; 48 + __asm__ __volatile__("cpuid" 49 + : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) 50 + : "a"(func), "c"(subfunc)); 51 + 52 + cr4 = get_cr4(); 53 + 54 + return (!!(ecx & X86_FEATURE_OSXSAVE)) == (!!(cr4 & X86_CR4_OSXSAVE)); 55 + } 56 + 57 + static void guest_code(void) 58 + { 59 + uint64_t cr4; 60 + 61 + /* turn on CR4.OSXSAVE */ 62 + cr4 = get_cr4(); 63 + cr4 |= X86_CR4_OSXSAVE; 64 + set_cr4(cr4); 65 + 66 + /* verify CR4.OSXSAVE == CPUID.OSXSAVE */ 67 + if (!cr4_cpuid_is_sync()) 68 + exit_to_hv(GUEST_FAILED); 69 + 70 + /* notify hypervisor to change CR4 */ 71 + exit_to_hv(GUEST_UPDATE_CR4); 72 + 73 + /* check again */ 74 + if (!cr4_cpuid_is_sync()) 75 + exit_to_hv(GUEST_FAILED); 76 + 77 + exit_to_hv(GUEST_DONE); 78 + } 79 + 80 + int main(int argc, char *argv[]) 81 + { 82 + struct kvm_run *run; 83 + struct kvm_vm *vm; 84 + struct kvm_sregs sregs; 85 + struct kvm_cpuid_entry2 *entry; 86 + int rc; 87 + 88 + entry = kvm_get_supported_cpuid_entry(1); 89 + if (!(entry->ecx & X86_FEATURE_XSAVE)) { 90 + printf("XSAVE feature not 
supported, skipping test\n"); 91 + return 0; 92 + } 93 + 94 + /* Tell stdout not to buffer its content */ 95 + setbuf(stdout, NULL); 96 + 97 + /* Create VM */ 98 + vm = vm_create_default(VCPU_ID, guest_code); 99 + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); 100 + run = vcpu_state(vm, VCPU_ID); 101 + 102 + while (1) { 103 + rc = _vcpu_run(vm, VCPU_ID); 104 + 105 + if (run->exit_reason == KVM_EXIT_IO) { 106 + switch (run->io.port) { 107 + case GUEST_UPDATE_CR4: 108 + /* emulate hypervisor clearing CR4.OSXSAVE */ 109 + vcpu_sregs_get(vm, VCPU_ID, &sregs); 110 + sregs.cr4 &= ~X86_CR4_OSXSAVE; 111 + vcpu_sregs_set(vm, VCPU_ID, &sregs); 112 + break; 113 + case GUEST_FAILED: 114 + TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); 115 + break; 116 + case GUEST_DONE: 117 + goto done; 118 + default: 119 + TEST_ASSERT(false, "Unknown port 0x%x.", 120 + run->io.port); 121 + } 122 + } 123 + } 124 + 125 + kvm_vm_free(vm); 126 + 127 + done: 128 + return 0; 129 + }
+3 -1
tools/testing/selftests/kvm/include/kvm_util.h
··· 53 53 54 54 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); 55 55 void kvm_vm_free(struct kvm_vm *vmp); 56 + void kvm_vm_restart(struct kvm_vm *vmp, int perm); 57 + void kvm_vm_release(struct kvm_vm *vmp); 56 58 57 59 int kvm_memcmp_hva_gva(void *hva, 58 60 struct kvm_vm *vm, const vm_vaddr_t gva, size_t len); ··· 77 75 uint32_t vcpuid, unsigned long ioctl, void *arg); 78 76 void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); 79 77 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); 80 - void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); 78 + void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot); 81 79 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, 82 80 uint32_t data_memslot, uint32_t pgd_memslot); 83 81 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
+62 -4
tools/testing/selftests/kvm/include/vmx.h
··· 380 380 return ret; 381 381 } 382 382 383 + static inline int vmptrst(uint64_t *value) 384 + { 385 + uint64_t tmp; 386 + uint8_t ret; 387 + 388 + __asm__ __volatile__("vmptrst %[value]; setna %[ret]" 389 + : [value]"=m"(tmp), [ret]"=rm"(ret) 390 + : : "cc", "memory"); 391 + 392 + *value = tmp; 393 + return ret; 394 + } 395 + 396 + /* 397 + * A wrapper around vmptrst that ignores errors and returns zero if the 398 + * vmptrst instruction fails. 399 + */ 400 + static inline uint64_t vmptrstz(void) 401 + { 402 + uint64_t value = 0; 403 + vmptrst(&value); 404 + return value; 405 + } 406 + 383 407 /* 384 408 * No guest state (e.g. GPRs) is established by this vmlaunch. 385 409 */ ··· 468 444 return ret; 469 445 } 470 446 447 + static inline void vmcall(void) 448 + { 449 + /* Currently, L1 destroys our GPRs during vmexits. */ 450 + __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" : : : 451 + "rax", "rbx", "rcx", "rdx", 452 + "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", 453 + "r13", "r14", "r15"); 454 + } 455 + 471 456 static inline int vmread(uint64_t encoding, uint64_t *value) 472 457 { 473 458 uint64_t tmp; ··· 519 486 return rdmsr(MSR_IA32_VMX_BASIC); 520 487 } 521 488 522 - void prepare_for_vmx_operation(void); 523 - void prepare_vmcs(void *guest_rip, void *guest_rsp); 524 - struct kvm_vm *vm_create_default_vmx(uint32_t vcpuid, 525 - vmx_guest_code_t guest_code); 489 + struct vmx_pages { 490 + void *vmxon_hva; 491 + uint64_t vmxon_gpa; 492 + void *vmxon; 493 + 494 + void *vmcs_hva; 495 + uint64_t vmcs_gpa; 496 + void *vmcs; 497 + 498 + void *msr_hva; 499 + uint64_t msr_gpa; 500 + void *msr; 501 + 502 + void *shadow_vmcs_hva; 503 + uint64_t shadow_vmcs_gpa; 504 + void *shadow_vmcs; 505 + 506 + void *vmread_hva; 507 + uint64_t vmread_gpa; 508 + void *vmread; 509 + 510 + void *vmwrite_hva; 511 + uint64_t vmwrite_gpa; 512 + void *vmwrite; 513 + }; 514 + 515 + struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); 516 + bool 
prepare_for_vmx_operation(struct vmx_pages *vmx); 517 + void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); 526 518 527 519 #endif /* !SELFTEST_KVM_VMX_H */
+6 -2
tools/testing/selftests/kvm/include/x86.h
··· 59 59 struct desc64 { 60 60 uint16_t limit0; 61 61 uint16_t base0; 62 - unsigned base1:8, type:5, dpl:2, p:1; 63 - unsigned limit1:4, zero0:3, g:1, base2:8; 62 + unsigned base1:8, s:1, type:4, dpl:2, p:1; 63 + unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8; 64 64 uint32_t base3; 65 65 uint32_t zero1; 66 66 } __attribute__((packed)); ··· 302 302 } 303 303 return 0; 304 304 } 305 + 306 + struct kvm_x86_state; 307 + struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); 308 + void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); 305 309 306 310 /* 307 311 * Basic CPU control in CR0
+73 -21
tools/testing/selftests/kvm/lib/kvm_util.c
··· 62 62 return ret; 63 63 } 64 64 65 + static void vm_open(struct kvm_vm *vm, int perm) 66 + { 67 + vm->kvm_fd = open(KVM_DEV_PATH, perm); 68 + if (vm->kvm_fd < 0) 69 + exit(KSFT_SKIP); 70 + 71 + /* Create VM. */ 72 + vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, NULL); 73 + TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " 74 + "rc: %i errno: %i", vm->fd, errno); 75 + } 76 + 65 77 /* VM Create 66 78 * 67 79 * Input Args: ··· 102 90 TEST_ASSERT(vm != NULL, "Insufficent Memory"); 103 91 104 92 vm->mode = mode; 105 - kvm_fd = open(KVM_DEV_PATH, perm); 106 - if (kvm_fd < 0) 107 - exit(KSFT_SKIP); 108 - 109 - /* Create VM. */ 110 - vm->fd = ioctl(kvm_fd, KVM_CREATE_VM, NULL); 111 - TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " 112 - "rc: %i errno: %i", vm->fd, errno); 113 - 114 - close(kvm_fd); 93 + vm_open(vm, perm); 115 94 116 95 /* Setup mode specific traits. */ 117 96 switch (vm->mode) { ··· 133 130 0, 0, phy_pages, 0); 134 131 135 132 return vm; 133 + } 134 + 135 + /* VM Restart 136 + * 137 + * Input Args: 138 + * vm - VM that has been released before 139 + * perm - permission 140 + * 141 + * Output Args: None 142 + * 143 + * Reopens the file descriptors associated to the VM and reinstates the 144 + * global state, such as the irqchip and the memory regions that are mapped 145 + * into the guest. 
146 + */ 147 + void kvm_vm_restart(struct kvm_vm *vmp, int perm) 148 + { 149 + struct userspace_mem_region *region; 150 + 151 + vm_open(vmp, perm); 152 + if (vmp->has_irqchip) 153 + vm_create_irqchip(vmp); 154 + 155 + for (region = vmp->userspace_mem_region_head; region; 156 + region = region->next) { 157 + int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region); 158 + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" 159 + " rc: %i errno: %i\n" 160 + " slot: %u flags: 0x%x\n" 161 + " guest_phys_addr: 0x%lx size: 0x%lx", 162 + ret, errno, region->region.slot, region->region.flags, 163 + region->region.guest_phys_addr, 164 + region->region.memory_size); 165 + } 136 166 } 137 167 138 168 /* Userspace Memory Region Find ··· 274 238 static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid) 275 239 { 276 240 struct vcpu *vcpu = vcpu_find(vm, vcpuid); 241 + int ret; 277 242 278 - int ret = close(vcpu->fd); 243 + ret = munmap(vcpu->state, sizeof(*vcpu->state)); 244 + TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i " 245 + "errno: %i", ret, errno); 246 + close(vcpu->fd); 279 247 TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " 280 248 "errno: %i", ret, errno); 281 249 ··· 292 252 free(vcpu); 293 253 } 294 254 255 + void kvm_vm_release(struct kvm_vm *vmp) 256 + { 257 + int ret; 258 + 259 + /* Free VCPUs. */ 260 + while (vmp->vcpu_head) 261 + vm_vcpu_rm(vmp, vmp->vcpu_head->id); 262 + 263 + /* Close file descriptor for the VM. */ 264 + ret = close(vmp->fd); 265 + TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" 266 + " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); 267 + 268 + close(vmp->kvm_fd); 269 + TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n" 270 + " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); 271 + } 295 272 296 273 /* Destroys and frees the VM pointed to by vmp. 297 274 */ ··· 339 282 free(region); 340 283 } 341 284 342 - /* Free VCPUs. 
*/ 343 - while (vmp->vcpu_head) 344 - vm_vcpu_rm(vmp, vmp->vcpu_head->id); 345 - 346 285 /* Free sparsebit arrays. */ 347 286 sparsebit_free(&vmp->vpages_valid); 348 287 sparsebit_free(&vmp->vpages_mapped); 349 288 350 - /* Close file descriptor for the VM. */ 351 - ret = close(vmp->fd); 352 - TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" 353 - " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); 289 + kvm_vm_release(vmp); 354 290 355 291 /* Free the structure describing the VM. */ 356 292 free(vmp); ··· 751 701 * Creates and adds to the VM specified by vm and virtual CPU with 752 702 * the ID given by vcpuid. 753 703 */ 754 - void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) 704 + void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot) 755 705 { 756 706 struct vcpu *vcpu; 757 707 ··· 786 736 vcpu->next = vm->vcpu_head; 787 737 vm->vcpu_head = vcpu; 788 738 789 - vcpu_setup(vm, vcpuid); 739 + vcpu_setup(vm, vcpuid, pgd_memslot, gdt_memslot); 790 740 } 791 741 792 742 /* VM Virtual Address Unused Gap ··· 1007 957 ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0); 1008 958 TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, " 1009 959 "rc: %i errno: %i", ret, errno); 960 + 961 + vm->has_irqchip = true; 1010 962 } 1011 963 1012 964 /* VM VCPU State
+6 -1
tools/testing/selftests/kvm/lib/kvm_util_internal.h
··· 43 43 44 44 struct kvm_vm { 45 45 int mode; 46 + int kvm_fd; 46 47 int fd; 47 48 unsigned int page_size; 48 49 unsigned int page_shift; ··· 52 51 struct userspace_mem_region *userspace_mem_region_head; 53 52 struct sparsebit *vpages_valid; 54 53 struct sparsebit *vpages_mapped; 54 + 55 + bool has_irqchip; 55 56 bool pgd_created; 56 57 vm_paddr_t pgd; 58 + vm_vaddr_t gdt; 59 + vm_vaddr_t tss; 57 60 }; 58 61 59 62 struct vcpu *vcpu_find(struct kvm_vm *vm, 60 63 uint32_t vcpuid); 61 - void vcpu_setup(struct kvm_vm *vm, int vcpuid); 64 + void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot); 62 65 void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); 63 66 void regs_dump(FILE *stream, struct kvm_regs *regs, 64 67 uint8_t indent);
+72 -32
tools/testing/selftests/kvm/lib/vmx.c
··· 13 13 #include "x86.h" 14 14 #include "vmx.h" 15 15 16 - /* Create a default VM for VMX tests. 16 + /* Allocate memory regions for nested VMX tests. 17 17 * 18 18 * Input Args: 19 - * vcpuid - The id of the single VCPU to add to the VM. 20 - * guest_code - The vCPU's entry point 19 + * vm - The VM to allocate guest-virtual addresses in. 21 20 * 22 - * Output Args: None 21 + * Output Args: 22 + * p_vmx_gva - The guest virtual address for the struct vmx_pages. 23 23 * 24 24 * Return: 25 - * Pointer to opaque structure that describes the created VM. 25 + * Pointer to structure with the addresses of the VMX areas. 26 26 */ 27 - struct kvm_vm * 28 - vm_create_default_vmx(uint32_t vcpuid, vmx_guest_code_t guest_code) 27 + struct vmx_pages * 28 + vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) 29 29 { 30 - struct kvm_cpuid2 *cpuid; 31 - struct kvm_vm *vm; 32 - vm_vaddr_t vmxon_vaddr; 33 - vm_paddr_t vmxon_paddr; 34 - vm_vaddr_t vmcs_vaddr; 35 - vm_paddr_t vmcs_paddr; 36 - 37 - vm = vm_create_default(vcpuid, (void *) guest_code); 38 - 39 - /* Enable nesting in CPUID */ 40 - vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); 30 + vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); 31 + struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva); 41 32 42 33 /* Setup of a region of guest memory for the vmxon region. */ 43 - vmxon_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0); 44 - vmxon_paddr = addr_gva2gpa(vm, vmxon_vaddr); 34 + vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); 35 + vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon); 36 + vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon); 45 37 46 38 /* Setup of a region of guest memory for a vmcs. 
*/ 47 - vmcs_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0); 48 - vmcs_paddr = addr_gva2gpa(vm, vmcs_vaddr); 39 + vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); 40 + vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs); 41 + vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs); 49 42 50 - vcpu_args_set(vm, vcpuid, 4, vmxon_vaddr, vmxon_paddr, vmcs_vaddr, 51 - vmcs_paddr); 43 + /* Setup of a region of guest memory for the MSR bitmap. */ 44 + vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); 45 + vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); 46 + vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); 47 + memset(vmx->msr_hva, 0, getpagesize()); 52 48 53 - return vm; 49 + /* Setup of a region of guest memory for the shadow VMCS. */ 50 + vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); 51 + vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs); 52 + vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs); 53 + 54 + /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. 
*/ 55 + vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); 56 + vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread); 57 + vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread); 58 + memset(vmx->vmread_hva, 0, getpagesize()); 59 + 60 + vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); 61 + vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite); 62 + vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); 63 + memset(vmx->vmwrite_hva, 0, getpagesize()); 64 + 65 + *p_vmx_gva = vmx_gva; 66 + return vmx; 54 67 } 55 68 56 - void prepare_for_vmx_operation(void) 69 + bool prepare_for_vmx_operation(struct vmx_pages *vmx) 57 70 { 58 71 uint64_t feature_control; 59 72 uint64_t required; ··· 101 88 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 102 89 if ((feature_control & required) != required) 103 90 wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | required); 91 + 92 + /* Enter VMX root operation. */ 93 + *(uint32_t *)(vmx->vmxon) = vmcs_revision(); 94 + if (vmxon(vmx->vmxon_gpa)) 95 + return false; 96 + 97 + /* Load a VMCS. */ 98 + *(uint32_t *)(vmx->vmcs) = vmcs_revision(); 99 + if (vmclear(vmx->vmcs_gpa)) 100 + return false; 101 + 102 + if (vmptrld(vmx->vmcs_gpa)) 103 + return false; 104 + 105 + /* Setup shadow VMCS, do not load it yet. */ 106 + *(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul; 107 + if (vmclear(vmx->shadow_vmcs_gpa)) 108 + return false; 109 + 110 + return true; 104 111 } 105 112 106 113 /* 107 114 * Initialize the control fields to the most basic settings possible. 
108 115 */ 109 - static inline void init_vmcs_control_fields(void) 116 + static inline void init_vmcs_control_fields(struct vmx_pages *vmx) 110 117 { 111 118 vmwrite(VIRTUAL_PROCESSOR_ID, 0); 112 119 vmwrite(POSTED_INTR_NV, 0); 113 120 114 - vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PINBASED_CTLS)); 115 - vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PROCBASED_CTLS)); 121 + vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); 122 + if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, 0)) 123 + vmwrite(CPU_BASED_VM_EXEC_CONTROL, 124 + rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); 125 + else 126 + vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS)); 116 127 vmwrite(EXCEPTION_BITMAP, 0); 117 128 vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); 118 129 vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */ ··· 150 113 vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); 151 114 vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0); 152 115 vmwrite(TPR_THRESHOLD, 0); 153 - vmwrite(SECONDARY_VM_EXEC_CONTROL, 0); 154 116 155 117 vmwrite(CR0_GUEST_HOST_MASK, 0); 156 118 vmwrite(CR4_GUEST_HOST_MASK, 0); 157 119 vmwrite(CR0_READ_SHADOW, get_cr0()); 158 120 vmwrite(CR4_READ_SHADOW, get_cr4()); 121 + 122 + vmwrite(MSR_BITMAP, vmx->msr_gpa); 123 + vmwrite(VMREAD_BITMAP, vmx->vmread_gpa); 124 + vmwrite(VMWRITE_BITMAP, vmx->vmwrite_gpa); 159 125 } 160 126 161 127 /* ··· 275 235 vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP)); 276 236 } 277 237 278 - void prepare_vmcs(void *guest_rip, void *guest_rsp) 238 + void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) 279 239 { 280 - init_vmcs_control_fields(); 240 + init_vmcs_control_fields(vmx); 281 241 init_vmcs_host_state(); 282 242 init_vmcs_guest_state(guest_rip, guest_rsp); 283 243 }
+215 -41
tools/testing/selftests/kvm/lib/x86.c
··· 239 239 vm_paddr_t paddr = vm_phy_page_alloc(vm, 240 240 KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); 241 241 vm->pgd = paddr; 242 - 243 - /* Set pointer to pgd tables in all the VCPUs that 244 - * have already been created. Future VCPUs will have 245 - * the value set as each one is created. 246 - */ 247 - for (struct vcpu *vcpu = vm->vcpu_head; vcpu; 248 - vcpu = vcpu->next) { 249 - struct kvm_sregs sregs; 250 - 251 - /* Obtain the current system register settings */ 252 - vcpu_sregs_get(vm, vcpu->id, &sregs); 253 - 254 - /* Set and store the pointer to the start of the 255 - * pgd tables. 256 - */ 257 - sregs.cr3 = vm->pgd; 258 - vcpu_sregs_set(vm, vcpu->id, &sregs); 259 - } 260 - 261 242 vm->pgd_created = true; 262 243 } 263 244 } ··· 441 460 segp->unusable = true; 442 461 } 443 462 463 + static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) 464 + { 465 + void *gdt = addr_gva2hva(vm, vm->gdt); 466 + struct desc64 *desc = gdt + (segp->selector >> 3) * 8; 467 + 468 + desc->limit0 = segp->limit & 0xFFFF; 469 + desc->base0 = segp->base & 0xFFFF; 470 + desc->base1 = segp->base >> 16; 471 + desc->s = segp->s; 472 + desc->type = segp->type; 473 + desc->dpl = segp->dpl; 474 + desc->p = segp->present; 475 + desc->limit1 = segp->limit >> 16; 476 + desc->l = segp->l; 477 + desc->db = segp->db; 478 + desc->g = segp->g; 479 + desc->base2 = segp->base >> 24; 480 + if (!segp->s) 481 + desc->base3 = segp->base >> 32; 482 + } 483 + 484 + 444 485 /* Set Long Mode Flat Kernel Code Segment 445 486 * 446 487 * Input Args: 488 + * vm - VM whose GDT is being filled, or NULL to only write segp 447 489 * selector - selector value 448 490 * 449 491 * Output Args: ··· 477 473 * Sets up the KVM segment pointed to by segp, to be a code segment 478 474 * with the selector value given by selector. 
479 475 */ 480 - static void kvm_seg_set_kernel_code_64bit(uint16_t selector, 476 + static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, 481 477 struct kvm_segment *segp) 482 478 { 483 479 memset(segp, 0, sizeof(*segp)); ··· 490 486 segp->g = true; 491 487 segp->l = true; 492 488 segp->present = 1; 489 + if (vm) 490 + kvm_seg_fill_gdt_64bit(vm, segp); 493 491 } 494 492 495 493 /* Set Long Mode Flat Kernel Data Segment 496 494 * 497 495 * Input Args: 496 + * vm - VM whose GDT is being filled, or NULL to only write segp 498 497 * selector - selector value 499 498 * 500 499 * Output Args: ··· 508 501 * Sets up the KVM segment pointed to by segp, to be a data segment 509 502 * with the selector value given by selector. 510 503 */ 511 - static void kvm_seg_set_kernel_data_64bit(uint16_t selector, 504 + static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, 512 505 struct kvm_segment *segp) 513 506 { 514 507 memset(segp, 0, sizeof(*segp)); ··· 520 513 */ 521 514 segp->g = true; 522 515 segp->present = true; 516 + if (vm) 517 + kvm_seg_fill_gdt_64bit(vm, segp); 523 518 } 524 519 525 520 /* Address Guest Virtual to Guest Physical ··· 584 575 "gva: 0x%lx", gva); 585 576 } 586 577 587 - void vcpu_setup(struct kvm_vm *vm, int vcpuid) 578 + static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot, 579 + int pgd_memslot) 580 + { 581 + if (!vm->gdt) 582 + vm->gdt = vm_vaddr_alloc(vm, getpagesize(), 583 + KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); 584 + 585 + dt->base = vm->gdt; 586 + dt->limit = getpagesize(); 587 + } 588 + 589 + static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, 590 + int selector, int gdt_memslot, 591 + int pgd_memslot) 592 + { 593 + if (!vm->tss) 594 + vm->tss = vm_vaddr_alloc(vm, getpagesize(), 595 + KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); 596 + 597 + memset(segp, 0, sizeof(*segp)); 598 + segp->base = vm->tss; 599 + segp->limit = 0x67; 600 + 
segp->selector = selector; 601 + segp->type = 0xb; 602 + segp->present = 1; 603 + kvm_seg_fill_gdt_64bit(vm, segp); 604 + } 605 + 606 + void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) 588 607 { 589 608 struct kvm_sregs sregs; 590 609 591 610 /* Set mode specific system register values. */ 592 611 vcpu_sregs_get(vm, vcpuid, &sregs); 612 + 613 + sregs.idt.limit = 0; 614 + 615 + kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); 593 616 594 617 switch (vm->mode) { 595 618 case VM_MODE_FLAT48PG: ··· 630 589 sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); 631 590 632 591 kvm_seg_set_unusable(&sregs.ldt); 633 - kvm_seg_set_kernel_code_64bit(0x8, &sregs.cs); 634 - kvm_seg_set_kernel_data_64bit(0x10, &sregs.ds); 635 - kvm_seg_set_kernel_data_64bit(0x10, &sregs.es); 592 + kvm_seg_set_kernel_code_64bit(vm, 0x8, &sregs.cs); 593 + kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.ds); 594 + kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.es); 595 + kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot); 636 596 break; 637 597 638 598 default: 639 599 TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode); 640 600 } 601 + 602 + sregs.cr3 = vm->pgd; 641 603 vcpu_sregs_set(vm, vcpuid, &sregs); 642 - 643 - /* If virtual translation table have been setup, set system register 644 - * to point to the tables. It's okay if they haven't been setup yet, 645 - * in that the code that sets up the virtual translation tables, will 646 - * go back through any VCPUs that have already been created and set 647 - * their values. 
648 - */ 649 - if (vm->pgd_created) { 650 - struct kvm_sregs sregs; 651 - 652 - vcpu_sregs_get(vm, vcpuid, &sregs); 653 - 654 - sregs.cr3 = vm->pgd; 655 - vcpu_sregs_set(vm, vcpuid, &sregs); 656 - } 657 604 } 658 605 /* Adds a vCPU with reasonable defaults (i.e., a stack) 659 606 * ··· 658 629 DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); 659 630 660 631 /* Create VCPU */ 661 - vm_vcpu_add(vm, vcpuid); 632 + vm_vcpu_add(vm, vcpuid, 0, 0); 662 633 663 634 /* Setup guest general purpose registers */ 664 635 vcpu_regs_get(vm, vcpuid, &regs); ··· 726 697 vm_vcpu_add_default(vm, vcpuid, guest_code); 727 698 728 699 return vm; 700 + } 701 + 702 + struct kvm_x86_state { 703 + struct kvm_vcpu_events events; 704 + struct kvm_mp_state mp_state; 705 + struct kvm_regs regs; 706 + struct kvm_xsave xsave; 707 + struct kvm_xcrs xcrs; 708 + struct kvm_sregs sregs; 709 + struct kvm_debugregs debugregs; 710 + union { 711 + struct kvm_nested_state nested; 712 + char nested_[16384]; 713 + }; 714 + struct kvm_msrs msrs; 715 + }; 716 + 717 + static int kvm_get_num_msrs(struct kvm_vm *vm) 718 + { 719 + struct kvm_msr_list nmsrs; 720 + int r; 721 + 722 + nmsrs.nmsrs = 0; 723 + r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); 724 + TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", 725 + r); 726 + 727 + return nmsrs.nmsrs; 728 + } 729 + 730 + struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) 731 + { 732 + struct vcpu *vcpu = vcpu_find(vm, vcpuid); 733 + struct kvm_msr_list *list; 734 + struct kvm_x86_state *state; 735 + int nmsrs, r, i; 736 + static int nested_size = -1; 737 + 738 + if (nested_size == -1) { 739 + nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE); 740 + TEST_ASSERT(nested_size <= sizeof(state->nested_), 741 + "Nested state size too big, %i > %zi", 742 + nested_size, sizeof(state->nested_)); 743 + } 744 + 745 + nmsrs = kvm_get_num_msrs(vm); 746 + list = malloc(sizeof(*list) + nmsrs * 
sizeof(list->indices[0])); 747 + list->nmsrs = nmsrs; 748 + r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); 749 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", 750 + r); 751 + 752 + state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); 753 + r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); 754 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", 755 + r); 756 + 757 + r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); 758 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", 759 + r); 760 + 761 + r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); 762 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", 763 + r); 764 + 765 + r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave); 766 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", 767 + r); 768 + 769 + r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); 770 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", 771 + r); 772 + 773 + r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); 774 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", 775 + r); 776 + 777 + if (nested_size) { 778 + state->nested.size = sizeof(state->nested_); 779 + r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); 780 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", 781 + r); 782 + TEST_ASSERT(state->nested.size <= nested_size, 783 + "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", 784 + state->nested.size, nested_size); 785 + } else 786 + state->nested.size = 0; 787 + 788 + state->msrs.nmsrs = nmsrs; 789 + for (i = 0; i < nmsrs; i++) 790 + state->msrs.entries[i].index = list->indices[i]; 791 + r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); 792 + TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed at %x)", 793 + r, r == nmsrs ? 
-1 : list->indices[r]); 794 + 795 + r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); 796 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", 797 + r); 798 + 799 + free(list); 800 + return state; 801 + } 802 + 803 + void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state) 804 + { 805 + struct vcpu *vcpu = vcpu_find(vm, vcpuid); 806 + int r; 807 + 808 + if (state->nested.size) { 809 + r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); 810 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", 811 + r); 812 + } 813 + 814 + r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave); 815 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", 816 + r); 817 + 818 + r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); 819 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", 820 + r); 821 + 822 + r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); 823 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", 824 + r); 825 + 826 + r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); 827 + TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", 828 + r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index); 829 + 830 + r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); 831 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", 832 + r); 833 + 834 + r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); 835 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", 836 + r); 837 + 838 + r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); 839 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", 840 + r); 841 + 842 + r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); 843 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", 844 + r); 729 845 }
+218
tools/testing/selftests/kvm/state_test.c
··· 1 + /* 2 + * KVM_GET/SET_* tests 3 + * 4 + * Copyright (C) 2018, Red Hat, Inc. 5 + * 6 + * This work is licensed under the terms of the GNU GPL, version 2. 7 + * 8 + * Tests for vCPU state save/restore, including nested guest state. 9 + */ 10 + #define _GNU_SOURCE /* for program_invocation_short_name */ 11 + #include <fcntl.h> 12 + #include <stdio.h> 13 + #include <stdlib.h> 14 + #include <string.h> 15 + #include <sys/ioctl.h> 16 + 17 + #include "test_util.h" 18 + 19 + #include "kvm_util.h" 20 + #include "x86.h" 21 + #include "vmx.h" 22 + 23 + #define VCPU_ID 5 24 + #define PORT_SYNC 0x1000 25 + #define PORT_ABORT 0x1001 26 + #define PORT_DONE 0x1002 27 + 28 + static inline void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1) 29 + { 30 + __asm__ __volatile__("in %[port], %%al" 31 + : 32 + : [port]"d"(port), "D"(arg0), "S"(arg1) 33 + : "rax"); 34 + } 35 + 36 + #define exit_to_l0(_port, _arg0, _arg1) \ 37 + __exit_to_l0(_port, (uint64_t) (_arg0), (uint64_t) (_arg1)) 38 + 39 + #define GUEST_ASSERT(_condition) do { \ 40 + if (!(_condition)) \ 41 + exit_to_l0(PORT_ABORT, "Failed guest assert: " #_condition, __LINE__);\ 42 + } while (0) 43 + 44 + #define GUEST_SYNC(stage) \ 45 + exit_to_l0(PORT_SYNC, "hello", stage); 46 + 47 + static bool have_nested_state; 48 + 49 + void l2_guest_code(void) 50 + { 51 + GUEST_SYNC(5); 52 + 53 + /* Exit to L1 */ 54 + vmcall(); 55 + 56 + /* L1 has now set up a shadow VMCS for us. */ 57 + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); 58 + GUEST_SYNC(9); 59 + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); 60 + GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0fffee)); 61 + GUEST_SYNC(10); 62 + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0fffee); 63 + GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0ffffee)); 64 + GUEST_SYNC(11); 65 + 66 + /* Done, exit to L1 and never come back. 
*/ 67 + vmcall(); 68 + } 69 + 70 + void l1_guest_code(struct vmx_pages *vmx_pages) 71 + { 72 + #define L2_GUEST_STACK_SIZE 64 73 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 74 + 75 + GUEST_ASSERT(vmx_pages->vmcs_gpa); 76 + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); 77 + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); 78 + 79 + GUEST_SYNC(3); 80 + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); 81 + 82 + prepare_vmcs(vmx_pages, l2_guest_code, 83 + &l2_guest_stack[L2_GUEST_STACK_SIZE]); 84 + 85 + GUEST_SYNC(4); 86 + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); 87 + GUEST_ASSERT(!vmlaunch()); 88 + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); 89 + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); 90 + 91 + /* Check that the launched state is preserved. */ 92 + GUEST_ASSERT(vmlaunch()); 93 + 94 + GUEST_ASSERT(!vmresume()); 95 + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); 96 + 97 + GUEST_SYNC(6); 98 + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); 99 + 100 + GUEST_ASSERT(!vmresume()); 101 + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); 102 + 103 + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3); 104 + 105 + vmwrite(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); 106 + vmwrite(VMCS_LINK_POINTER, vmx_pages->shadow_vmcs_gpa); 107 + 108 + GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); 109 + GUEST_ASSERT(vmlaunch()); 110 + GUEST_SYNC(7); 111 + GUEST_ASSERT(vmlaunch()); 112 + GUEST_ASSERT(vmresume()); 113 + 114 + vmwrite(GUEST_RIP, 0xc0ffee); 115 + GUEST_SYNC(8); 116 + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); 117 + 118 + GUEST_ASSERT(!vmptrld(vmx_pages->vmcs_gpa)); 119 + GUEST_ASSERT(!vmresume()); 120 + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); 121 + 122 + GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); 123 + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); 124 + GUEST_ASSERT(vmlaunch()); 125 + GUEST_ASSERT(vmresume()); 126 + GUEST_SYNC(12); 127 + 
GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); 128 + GUEST_ASSERT(vmlaunch()); 129 + GUEST_ASSERT(vmresume()); 130 + } 131 + 132 + void guest_code(struct vmx_pages *vmx_pages) 133 + { 134 + GUEST_SYNC(1); 135 + GUEST_SYNC(2); 136 + 137 + if (vmx_pages) 138 + l1_guest_code(vmx_pages); 139 + 140 + exit_to_l0(PORT_DONE, 0, 0); 141 + } 142 + 143 + int main(int argc, char *argv[]) 144 + { 145 + struct vmx_pages *vmx_pages = NULL; 146 + vm_vaddr_t vmx_pages_gva = 0; 147 + 148 + struct kvm_regs regs1, regs2; 149 + struct kvm_vm *vm; 150 + struct kvm_run *run; 151 + struct kvm_x86_state *state; 152 + int stage; 153 + 154 + struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); 155 + 156 + /* Create VM */ 157 + vm = vm_create_default(VCPU_ID, guest_code); 158 + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); 159 + run = vcpu_state(vm, VCPU_ID); 160 + 161 + vcpu_regs_get(vm, VCPU_ID, &regs1); 162 + 163 + if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { 164 + vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva); 165 + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); 166 + } else { 167 + printf("will skip nested state checks\n"); 168 + vcpu_args_set(vm, VCPU_ID, 1, 0); 169 + } 170 + 171 + for (stage = 1;; stage++) { 172 + _vcpu_run(vm, VCPU_ID); 173 + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 174 + "Unexpected exit reason: %u (%s),\n", 175 + run->exit_reason, 176 + exit_reason_str(run->exit_reason)); 177 + 178 + memset(&regs1, 0, sizeof(regs1)); 179 + vcpu_regs_get(vm, VCPU_ID, &regs1); 180 + switch (run->io.port) { 181 + case PORT_ABORT: 182 + TEST_ASSERT(false, "%s at %s:%d", (const char *) regs1.rdi, 183 + __FILE__, regs1.rsi); 184 + /* NOT REACHED */ 185 + case PORT_SYNC: 186 + break; 187 + case PORT_DONE: 188 + goto done; 189 + default: 190 + TEST_ASSERT(false, "Unknown port 0x%x.", run->io.port); 191 + } 192 + 193 + /* PORT_SYNC is handled here. 
*/ 194 + TEST_ASSERT(!strcmp((const char *)regs1.rdi, "hello") && 195 + regs1.rsi == stage, "Unexpected register values vmexit #%lx, got %lx", 196 + stage, (ulong) regs1.rsi); 197 + 198 + state = vcpu_save_state(vm, VCPU_ID); 199 + kvm_vm_release(vm); 200 + 201 + /* Restore state in a new VM. */ 202 + kvm_vm_restart(vm, O_RDWR); 203 + vm_vcpu_add(vm, VCPU_ID, 0, 0); 204 + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); 205 + vcpu_load_state(vm, VCPU_ID, state); 206 + run = vcpu_state(vm, VCPU_ID); 207 + free(state); 208 + 209 + memset(&regs2, 0, sizeof(regs2)); 210 + vcpu_regs_get(vm, VCPU_ID, &regs2); 211 + TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)), 212 + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", 213 + (ulong) regs2.rdi, (ulong) regs2.rsi); 214 + } 215 + 216 + done: 217 + kvm_vm_free(vm); 218 + }
+14 -57
tools/testing/selftests/kvm/vmx_tsc_adjust_test.c
··· 46 46 PORT_DONE, 47 47 }; 48 48 49 - struct vmx_page { 50 - vm_vaddr_t virt; 51 - vm_paddr_t phys; 52 - }; 53 - 54 49 enum { 55 50 VMXON_PAGE = 0, 56 51 VMCS_PAGE, ··· 61 66 62 67 /* The virtual machine object. */ 63 68 static struct kvm_vm *vm; 64 - 65 - /* Array of vmx_page descriptors that is shared with the guest. */ 66 - struct vmx_page *vmx_pages; 67 69 68 70 #define exit_to_l0(_port, _arg) do_exit_to_l0(_port, (unsigned long) (_arg)) 69 71 static void do_exit_to_l0(uint16_t port, unsigned long arg) ··· 97 105 __asm__ __volatile__("vmcall"); 98 106 } 99 107 100 - static void l1_guest_code(struct vmx_page *vmx_pages) 108 + static void l1_guest_code(struct vmx_pages *vmx_pages) 101 109 { 102 110 #define L2_GUEST_STACK_SIZE 64 103 111 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; ··· 108 116 wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); 109 117 check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); 110 118 111 - prepare_for_vmx_operation(); 112 - 113 - /* Enter VMX root operation. */ 114 - *(uint32_t *)vmx_pages[VMXON_PAGE].virt = vmcs_revision(); 115 - GUEST_ASSERT(!vmxon(vmx_pages[VMXON_PAGE].phys)); 116 - 117 - /* Load a VMCS. */ 118 - *(uint32_t *)vmx_pages[VMCS_PAGE].virt = vmcs_revision(); 119 - GUEST_ASSERT(!vmclear(vmx_pages[VMCS_PAGE].phys)); 120 - GUEST_ASSERT(!vmptrld(vmx_pages[VMCS_PAGE].phys)); 119 + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); 121 120 122 121 /* Prepare the VMCS for L2 execution. */ 123 - prepare_vmcs(l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); 122 + prepare_vmcs(vmx_pages, l2_guest_code, 123 + &l2_guest_stack[L2_GUEST_STACK_SIZE]); 124 124 control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); 125 125 control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETING; 126 126 vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); 127 - vmwrite(MSR_BITMAP, vmx_pages[MSR_BITMAP_PAGE].phys); 128 127 vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); 129 128 130 129 /* Jump into L2. First, test failure to load guest CR3. 
*/ ··· 135 152 exit_to_l0(PORT_DONE, 0); 136 153 } 137 154 138 - static void allocate_vmx_page(struct vmx_page *page) 139 - { 140 - vm_vaddr_t virt; 141 - 142 - virt = vm_vaddr_alloc(vm, PAGE_SIZE, 0, 0, 0); 143 - memset(addr_gva2hva(vm, virt), 0, PAGE_SIZE); 144 - 145 - page->virt = virt; 146 - page->phys = addr_gva2gpa(vm, virt); 147 - } 148 - 149 - static vm_vaddr_t allocate_vmx_pages(void) 150 - { 151 - vm_vaddr_t vmx_pages_vaddr; 152 - int i; 153 - 154 - vmx_pages_vaddr = vm_vaddr_alloc( 155 - vm, sizeof(struct vmx_page) * NUM_VMX_PAGES, 0, 0, 0); 156 - 157 - vmx_pages = (void *) addr_gva2hva(vm, vmx_pages_vaddr); 158 - 159 - for (i = 0; i < NUM_VMX_PAGES; i++) 160 - allocate_vmx_page(&vmx_pages[i]); 161 - 162 - return vmx_pages_vaddr; 163 - } 164 - 165 155 void report(int64_t val) 166 156 { 167 157 printf("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n", ··· 143 187 144 188 int main(int argc, char *argv[]) 145 189 { 146 - vm_vaddr_t vmx_pages_vaddr; 190 + struct vmx_pages *vmx_pages; 191 + vm_vaddr_t vmx_pages_gva; 147 192 struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); 148 193 149 194 if (!(entry->ecx & CPUID_VMX)) { ··· 152 195 exit(KSFT_SKIP); 153 196 } 154 197 155 - vm = vm_create_default_vmx(VCPU_ID, (void *) l1_guest_code); 198 + vm = vm_create_default(VCPU_ID, (void *) l1_guest_code); 199 + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); 156 200 157 201 /* Allocate VMX pages and shared descriptors (vmx_pages). 
*/ 158 - vmx_pages_vaddr = allocate_vmx_pages(); 159 - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_vaddr); 202 + vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva); 203 + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); 160 204 161 205 for (;;) { 162 206 volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); 163 207 struct kvm_regs regs; 164 208 165 209 vcpu_run(vm, VCPU_ID); 166 - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 167 - "Got exit_reason other than KVM_EXIT_IO: %u (%s),\n", 168 - run->exit_reason, 169 - exit_reason_str(run->exit_reason)); 170 - 171 210 vcpu_regs_get(vm, VCPU_ID, &regs); 211 + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 212 + "Got exit_reason other than KVM_EXIT_IO: %u (%s), rip=%lx\n", 213 + run->exit_reason, 214 + exit_reason_str(run->exit_reason), regs.rip); 172 215 173 216 switch (run->io.port) { 174 217 case PORT_ABORT:
+19 -14
virt/kvm/kvm_main.c
··· 273 273 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that 274 274 * barrier here. 275 275 */ 276 - if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 276 + if (!kvm_arch_flush_remote_tlb(kvm) 277 + || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 277 278 ++kvm->stat.remote_tlb_flush; 278 279 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 279 280 } ··· 1170 1169 1171 1170 n = kvm_dirty_bitmap_bytes(memslot); 1172 1171 1173 - dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); 1172 + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1174 1173 memset(dirty_bitmap_buffer, 0, n); 1175 1174 1176 1175 spin_lock(&kvm->mmu_lock); ··· 1343 1342 } 1344 1343 1345 1344 /* 1346 - * The atomic path to get the writable pfn which will be stored in @pfn, 1347 - * true indicates success, otherwise false is returned. 1345 + * The fast path to get the writable pfn which will be stored in @pfn, 1346 + * true indicates success, otherwise false is returned. It's also the 1347 + * only part that runs if we are in atomic context. 
1348 1348 */ 1349 - static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, 1350 - bool write_fault, bool *writable, kvm_pfn_t *pfn) 1349 + static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, 1350 + bool *writable, kvm_pfn_t *pfn) 1351 1351 { 1352 1352 struct page *page[1]; 1353 1353 int npages; 1354 - 1355 - if (!(async || atomic)) 1356 - return false; 1357 1354 1358 1355 /* 1359 1356 * Fast pin a writable pfn only if it is a write fault request ··· 1496 1497 /* we can do it either atomically or asynchronously, not both */ 1497 1498 BUG_ON(atomic && async); 1498 1499 1499 - if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) 1500 + if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) 1500 1501 return pfn; 1501 1502 1502 1503 if (atomic) ··· 2126 2127 2127 2128 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 2128 2129 { 2130 + int ret = -EINTR; 2131 + int idx = srcu_read_lock(&vcpu->kvm->srcu); 2132 + 2129 2133 if (kvm_arch_vcpu_runnable(vcpu)) { 2130 2134 kvm_make_request(KVM_REQ_UNHALT, vcpu); 2131 - return -EINTR; 2135 + goto out; 2132 2136 } 2133 2137 if (kvm_cpu_has_pending_timer(vcpu)) 2134 - return -EINTR; 2138 + goto out; 2135 2139 if (signal_pending(current)) 2136 - return -EINTR; 2140 + goto out; 2137 2141 2138 - return 0; 2142 + ret = 0; 2143 + out: 2144 + srcu_read_unlock(&vcpu->kvm->srcu, idx); 2145 + return ret; 2139 2146 } 2140 2147 2141 2148 /*