Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

+2

Documentation/virt/kvm/api.rst

··· 4519 4519 leaves (0x40000000, 0x40000001). 4520 4520 4521 4521 Currently, the following list of CPUID leaves are returned: 4522 + 4522 4523 - HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 4523 4524 - HYPERV_CPUID_INTERFACE 4524 4525 - HYPERV_CPUID_VERSION ··· 4544 4543 Note, vcpu version of KVM_GET_SUPPORTED_HV_CPUID is currently deprecated. Unlike 4545 4544 system ioctl which exposes all supported feature bits unconditionally, vcpu 4546 4545 version has the following quirks: 4546 + 4547 4547 - HYPERV_CPUID_NESTED_FEATURES leaf and HV_X64_ENLIGHTENED_VMCS_RECOMMENDED 4548 4548 feature bit are only exposed when Enlightened VMCS was previously enabled 4549 4549 on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS).

+1 -1

arch/powerpc/kvm/book3s_64_mmu_hv.c

··· 591 591 } else { 592 592 /* Call KVM generic code to do the slow-path check */ 593 593 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, 594 - writing, &write_ok); 594 + writing, &write_ok, NULL); 595 595 if (is_error_noslot_pfn(pfn)) 596 596 return -EFAULT; 597 597 page = NULL;

+1 -1

arch/powerpc/kvm/book3s_64_mmu_radix.c

··· 822 822 823 823 /* Call KVM generic code to do the slow-path check */ 824 824 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, 825 - writing, upgrade_p); 825 + writing, upgrade_p, NULL); 826 826 if (is_error_noslot_pfn(pfn)) 827 827 return -EFAULT; 828 828 page = NULL;

+1 -5

arch/x86/include/asm/kvm-x86-ops.h

··· 93 93 KVM_X86_OP(handle_exit_irqoff) 94 94 KVM_X86_OP_NULL(request_immediate_exit) 95 95 KVM_X86_OP(sched_in) 96 - KVM_X86_OP_NULL(slot_enable_log_dirty) 97 - KVM_X86_OP_NULL(slot_disable_log_dirty) 98 - KVM_X86_OP_NULL(flush_log_dirty) 99 - KVM_X86_OP_NULL(enable_log_dirty_pt_masked) 100 - KVM_X86_OP_NULL(cpu_dirty_log_size) 96 + KVM_X86_OP_NULL(update_cpu_dirty_logging) 101 97 KVM_X86_OP_NULL(pre_block) 102 98 KVM_X86_OP_NULL(post_block) 103 99 KVM_X86_OP_NULL(vcpu_blocking)

+7 -29

arch/x86/include/asm/kvm_host.h

··· 89 89 KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) 90 90 #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) 91 91 #define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) 92 + #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \ 93 + KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 92 94 93 95 #define CR0_RESERVED_BITS \ 94 96 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ ··· 1009 1007 u32 bsp_vcpu_id; 1010 1008 1011 1009 u64 disabled_quirks; 1010 + int cpu_dirty_logging_count; 1012 1011 1013 1012 enum kvm_irqchip_mode irqchip_mode; 1014 1013 u8 nr_reserved_ioapic_pins; ··· 1274 1271 void (*sched_in)(struct kvm_vcpu *kvm, int cpu); 1275 1272 1276 1273 /* 1277 - * Arch-specific dirty logging hooks. These hooks are only supposed to 1278 - * be valid if the specific arch has hardware-accelerated dirty logging 1279 - * mechanism. Currently only for PML on VMX. 1280 - * 1281 - * - slot_enable_log_dirty: 1282 - * called when enabling log dirty mode for the slot. 1283 - * - slot_disable_log_dirty: 1284 - * called when disabling log dirty mode for the slot. 1285 - * also called when slot is created with log dirty disabled. 1286 - * - flush_log_dirty: 1287 - * called before reporting dirty_bitmap to userspace. 1288 - * - enable_log_dirty_pt_masked: 1289 - * called when reenabling log dirty for the GFNs in the mask after 1290 - * corresponding bits are cleared in slot->dirty_bitmap. 1274 + * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero 1275 + * value indicates CPU dirty logging is unsupported or disabled. 
1291 1276 */ 1292 - void (*slot_enable_log_dirty)(struct kvm *kvm, 1293 - struct kvm_memory_slot *slot); 1294 - void (*slot_disable_log_dirty)(struct kvm *kvm, 1295 - struct kvm_memory_slot *slot); 1296 - void (*flush_log_dirty)(struct kvm *kvm); 1297 - void (*enable_log_dirty_pt_masked)(struct kvm *kvm, 1298 - struct kvm_memory_slot *slot, 1299 - gfn_t offset, unsigned long mask); 1300 - int (*cpu_dirty_log_size)(void); 1277 + int cpu_dirty_log_size; 1278 + void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); 1301 1279 1302 1280 /* pmu operations of sub-arch */ 1303 1281 const struct kvm_pmu_ops *pmu_ops; ··· 1421 1437 struct kvm_memory_slot *memslot); 1422 1438 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, 1423 1439 struct kvm_memory_slot *memslot); 1424 - void kvm_mmu_slot_set_dirty(struct kvm *kvm, 1425 - struct kvm_memory_slot *memslot); 1426 - void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1427 - struct kvm_memory_slot *slot, 1428 - gfn_t gfn_offset, unsigned long mask); 1429 1440 void kvm_mmu_zap_all(struct kvm *kvm); 1430 1441 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); 1431 1442 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); ··· 1592 1613 void kvm_update_dr7(struct kvm_vcpu *vcpu); 1593 1614 1594 1615 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); 1595 - int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 1596 1616 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); 1597 1617 int kvm_mmu_load(struct kvm_vcpu *vcpu); 1598 1618 void kvm_mmu_unload(struct kvm_vcpu *vcpu);

+1 -1

arch/x86/kvm/cpuid.c

··· 408 408 409 409 kvm_cpu_cap_mask(CPUID_7_0_EBX, 410 410 F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | 411 - F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | 411 + F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) | 412 412 F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | 413 413 F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | 414 414 F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/

+79 -145

arch/x86/kvm/mmu/mmu.c

··· 1165 1165 * - W bit on ad-disabled SPTEs. 1166 1166 * Returns true iff any D or W bits were cleared. 1167 1167 */ 1168 - static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) 1168 + static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1169 + struct kvm_memory_slot *slot) 1169 1170 { 1170 1171 u64 *sptep; 1171 1172 struct rmap_iterator iter; ··· 1177 1176 flush |= spte_wrprot_for_clear_dirty(sptep); 1178 1177 else 1179 1178 flush |= spte_clear_dirty(sptep); 1180 - 1181 - return flush; 1182 - } 1183 - 1184 - static bool spte_set_dirty(u64 *sptep) 1185 - { 1186 - u64 spte = *sptep; 1187 - 1188 - rmap_printk("spte %p %llx\n", sptep, *sptep); 1189 - 1190 - /* 1191 - * Similar to the !kvm_x86_ops.slot_disable_log_dirty case, 1192 - * do not bother adding back write access to pages marked 1193 - * SPTE_AD_WRPROT_ONLY_MASK. 1194 - */ 1195 - spte |= shadow_dirty_mask; 1196 - 1197 - return mmu_spte_update(sptep, spte); 1198 - } 1199 - 1200 - static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) 1201 - { 1202 - u64 *sptep; 1203 - struct rmap_iterator iter; 1204 - bool flush = false; 1205 - 1206 - for_each_rmap_spte(rmap_head, &iter, sptep) 1207 - if (spte_ad_enabled(*sptep)) 1208 - flush |= spte_set_dirty(sptep); 1209 1179 1210 1180 return flush; 1211 1181 } ··· 1220 1248 * 1221 1249 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. 
1222 1250 */ 1223 - void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1224 - struct kvm_memory_slot *slot, 1225 - gfn_t gfn_offset, unsigned long mask) 1251 + static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1252 + struct kvm_memory_slot *slot, 1253 + gfn_t gfn_offset, unsigned long mask) 1226 1254 { 1227 1255 struct kvm_rmap_head *rmap_head; 1228 1256 ··· 1232 1260 while (mask) { 1233 1261 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), 1234 1262 PG_LEVEL_4K, slot); 1235 - __rmap_clear_dirty(kvm, rmap_head); 1263 + __rmap_clear_dirty(kvm, rmap_head, slot); 1236 1264 1237 1265 /* clear the first set bit */ 1238 1266 mask &= mask - 1; 1239 1267 } 1240 1268 } 1241 - EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); 1242 1269 1243 1270 /** 1244 1271 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected ··· 1253 1282 struct kvm_memory_slot *slot, 1254 1283 gfn_t gfn_offset, unsigned long mask) 1255 1284 { 1256 - if (kvm_x86_ops.enable_log_dirty_pt_masked) 1257 - static_call(kvm_x86_enable_log_dirty_pt_masked)(kvm, slot, 1258 - gfn_offset, 1259 - mask); 1285 + if (kvm_x86_ops.cpu_dirty_log_size) 1286 + kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); 1260 1287 else 1261 1288 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); 1262 1289 } 1263 1290 1264 1291 int kvm_cpu_dirty_log_size(void) 1265 1292 { 1266 - if (kvm_x86_ops.cpu_dirty_log_size) 1267 - return static_call(kvm_x86_cpu_dirty_log_size)(); 1268 - 1269 - return 0; 1293 + return kvm_x86_ops.cpu_dirty_log_size; 1270 1294 } 1271 1295 1272 1296 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, ··· 1291 1325 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); 1292 1326 } 1293 1327 1294 - static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) 1328 + static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, 1329 + struct kvm_memory_slot *slot) 1295 1330 { 1296 1331 u64 *sptep; 1297 1332 
struct rmap_iterator iter; ··· 1312 1345 struct kvm_memory_slot *slot, gfn_t gfn, int level, 1313 1346 unsigned long data) 1314 1347 { 1315 - return kvm_zap_rmapp(kvm, rmap_head); 1348 + return kvm_zap_rmapp(kvm, rmap_head, slot); 1316 1349 } 1317 1350 1318 1351 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, ··· 2466 2499 2467 2500 return r; 2468 2501 } 2469 - EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); 2502 + 2503 + static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 2504 + { 2505 + gpa_t gpa; 2506 + int r; 2507 + 2508 + if (vcpu->arch.mmu->direct_map) 2509 + return 0; 2510 + 2511 + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 2512 + 2513 + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2514 + 2515 + return r; 2516 + } 2470 2517 2471 2518 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 2472 2519 { ··· 2734 2753 if (sp->role.level > PG_LEVEL_4K) 2735 2754 return; 2736 2755 2756 + /* 2757 + * If addresses are being invalidated, skip prefetching to avoid 2758 + * accidentally prefetching those addresses. 
2759 + */ 2760 + if (unlikely(vcpu->kvm->mmu_notifier_count)) 2761 + return; 2762 + 2737 2763 __direct_pte_prefetch(vcpu, sp, sptep); 2738 2764 } 2739 2765 2740 - static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, 2741 - kvm_pfn_t pfn, struct kvm_memory_slot *slot) 2766 + static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 2767 + struct kvm_memory_slot *slot) 2742 2768 { 2743 2769 unsigned long hva; 2744 2770 pte_t *pte; ··· 2764 2776 */ 2765 2777 hva = __gfn_to_hva_memslot(slot, gfn); 2766 2778 2767 - pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); 2779 + pte = lookup_address_in_mm(kvm->mm, hva, &level); 2768 2780 if (unlikely(!pte)) 2769 2781 return PG_LEVEL_4K; 2770 2782 2771 2783 return level; 2784 + } 2785 + 2786 + int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, 2787 + gfn_t gfn, kvm_pfn_t pfn, int max_level) 2788 + { 2789 + struct kvm_lpage_info *linfo; 2790 + 2791 + max_level = min(max_level, max_huge_page_level); 2792 + for ( ; max_level > PG_LEVEL_4K; max_level--) { 2793 + linfo = lpage_info_slot(gfn, slot, max_level); 2794 + if (!linfo->disallow_lpage) 2795 + break; 2796 + } 2797 + 2798 + if (max_level == PG_LEVEL_4K) 2799 + return PG_LEVEL_4K; 2800 + 2801 + return host_pfn_mapping_level(kvm, gfn, pfn, slot); 2772 2802 } 2773 2803 2774 2804 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, ··· 2794 2788 bool huge_page_disallowed, int *req_level) 2795 2789 { 2796 2790 struct kvm_memory_slot *slot; 2797 - struct kvm_lpage_info *linfo; 2798 2791 kvm_pfn_t pfn = *pfnp; 2799 2792 kvm_pfn_t mask; 2800 2793 int level; ··· 2810 2805 if (!slot) 2811 2806 return PG_LEVEL_4K; 2812 2807 2813 - max_level = min(max_level, max_huge_page_level); 2814 - for ( ; max_level > PG_LEVEL_4K; max_level--) { 2815 - linfo = lpage_info_slot(gfn, slot, max_level); 2816 - if (!linfo->disallow_lpage) 2817 - break; 2818 - } 2819 - 2820 - if (max_level == PG_LEVEL_4K) 2821 - return PG_LEVEL_4K; 
2822 - 2823 - level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); 2808 + level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); 2824 2809 if (level == PG_LEVEL_4K) 2825 2810 return level; 2826 2811 ··· 3432 3437 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3433 3438 write_unlock(&vcpu->kvm->mmu_lock); 3434 3439 } 3435 - EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); 3436 3440 3437 3441 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, 3438 3442 u32 access, struct x86_exception *exception) ··· 3647 3653 } 3648 3654 3649 3655 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3650 - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, 3651 - bool *writable) 3656 + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, 3657 + bool write, bool *writable) 3652 3658 { 3653 3659 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 3654 3660 bool async; ··· 3661 3667 } 3662 3668 3663 3669 async = false; 3664 - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); 3670 + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, 3671 + write, writable, hva); 3665 3672 if (!async) 3666 3673 return false; /* *pfn has correct page already */ 3667 3674 ··· 3676 3681 return true; 3677 3682 } 3678 3683 3679 - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); 3684 + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, 3685 + write, writable, hva); 3680 3686 return false; 3681 3687 } 3682 3688 ··· 3690 3694 gfn_t gfn = gpa >> PAGE_SHIFT; 3691 3695 unsigned long mmu_seq; 3692 3696 kvm_pfn_t pfn; 3697 + hva_t hva; 3693 3698 int r; 3694 3699 3695 3700 if (page_fault_handle_page_track(vcpu, error_code, gfn)) ··· 3709 3712 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3710 3713 smp_rmb(); 3711 3714 3712 - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) 3715 + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, 3716 + write, &map_writable)) 3713 3717 return RET_PF_RETRY; 3714 3718 3715 3719 
if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) ··· 3723 3725 else 3724 3726 write_lock(&vcpu->kvm->mmu_lock); 3725 3727 3726 - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3728 + if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) 3727 3729 goto out_unlock; 3728 3730 r = make_mmu_pages_available(vcpu); 3729 3731 if (r) ··· 5001 5003 write_unlock(&vcpu->kvm->mmu_lock); 5002 5004 } 5003 5005 5004 - int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 5005 - { 5006 - gpa_t gpa; 5007 - int r; 5008 - 5009 - if (vcpu->arch.mmu->direct_map) 5010 - return 0; 5011 - 5012 - gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 5013 - 5014 - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 5015 - 5016 - return r; 5017 - } 5018 - EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); 5019 - 5020 5006 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, 5021 5007 void *insn, int insn_len) 5022 5008 { ··· 5099 5117 mmu->invlpg(vcpu, gva, root_hpa); 5100 5118 } 5101 5119 } 5102 - EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva); 5103 5120 5104 5121 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 5105 5122 { ··· 5138 5157 * for them. 5139 5158 */ 5140 5159 } 5141 - EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); 5142 5160 5143 5161 void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, 5144 5162 int tdp_huge_page_level) ··· 5162 5182 EXPORT_SYMBOL_GPL(kvm_configure_mmu); 5163 5183 5164 5184 /* The return value indicates if tlb flush on all vcpus is needed. */ 5165 - typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); 5185 + typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, 5186 + struct kvm_memory_slot *slot); 5166 5187 5167 5188 /* The caller should hold mmu-lock before calling this function. 
*/ 5168 5189 static __always_inline bool ··· 5177 5196 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, 5178 5197 end_gfn, &iterator) { 5179 5198 if (iterator.rmap) 5180 - flush |= fn(kvm, iterator.rmap); 5199 + flush |= fn(kvm, iterator.rmap, memslot); 5181 5200 5182 5201 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 5183 5202 if (flush && lock_flush_tlb) { ··· 5208 5227 end_level, memslot->base_gfn, 5209 5228 memslot->base_gfn + memslot->npages - 1, 5210 5229 lock_flush_tlb); 5211 - } 5212 - 5213 - static __always_inline bool 5214 - slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, 5215 - slot_level_handler fn, bool lock_flush_tlb) 5216 - { 5217 - return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, 5218 - KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); 5219 - } 5220 - 5221 - static __always_inline bool 5222 - slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, 5223 - slot_level_handler fn, bool lock_flush_tlb) 5224 - { 5225 - return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1, 5226 - KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); 5227 5230 } 5228 5231 5229 5232 static __always_inline bool ··· 5450 5485 } 5451 5486 5452 5487 static bool slot_rmap_write_protect(struct kvm *kvm, 5453 - struct kvm_rmap_head *rmap_head) 5488 + struct kvm_rmap_head *rmap_head, 5489 + struct kvm_memory_slot *slot) 5454 5490 { 5455 5491 return __rmap_write_protect(kvm, rmap_head, false); 5456 5492 } ··· 5485 5519 } 5486 5520 5487 5521 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, 5488 - struct kvm_rmap_head *rmap_head) 5522 + struct kvm_rmap_head *rmap_head, 5523 + struct kvm_memory_slot *slot) 5489 5524 { 5490 5525 u64 *sptep; 5491 5526 struct rmap_iterator iter; ··· 5507 5540 * mapping if the indirect sp has level = 1. 
5508 5541 */ 5509 5542 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && 5510 - (kvm_is_zone_device_pfn(pfn) || 5511 - PageCompound(pfn_to_page(pfn)))) { 5543 + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, 5544 + pfn, PG_LEVEL_NUM)) { 5512 5545 pte_list_remove(rmap_head, sptep); 5513 5546 5514 5547 if (kvm_available_flush_tlb_with_range()) ··· 5528 5561 const struct kvm_memory_slot *memslot) 5529 5562 { 5530 5563 /* FIXME: const-ify all uses of struct kvm_memory_slot. */ 5564 + struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; 5565 + 5531 5566 write_lock(&kvm->mmu_lock); 5532 - slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, 5533 - kvm_mmu_zap_collapsible_spte, true); 5567 + slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); 5534 5568 5535 5569 if (is_tdp_mmu_enabled(kvm)) 5536 - kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); 5570 + kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); 5537 5571 write_unlock(&kvm->mmu_lock); 5538 5572 } 5539 5573 ··· 5573 5605 if (flush) 5574 5606 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 5575 5607 } 5576 - EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); 5577 - 5578 - void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, 5579 - struct kvm_memory_slot *memslot) 5580 - { 5581 - bool flush; 5582 - 5583 - write_lock(&kvm->mmu_lock); 5584 - flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, 5585 - false); 5586 - if (is_tdp_mmu_enabled(kvm)) 5587 - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); 5588 - write_unlock(&kvm->mmu_lock); 5589 - 5590 - if (flush) 5591 - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 5592 - } 5593 - EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); 5594 - 5595 - void kvm_mmu_slot_set_dirty(struct kvm *kvm, 5596 - struct kvm_memory_slot *memslot) 5597 - { 5598 - bool flush; 5599 - 5600 - write_lock(&kvm->mmu_lock); 5601 - flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); 
5602 - if (is_tdp_mmu_enabled(kvm)) 5603 - flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); 5604 - write_unlock(&kvm->mmu_lock); 5605 - 5606 - if (flush) 5607 - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 5608 - } 5609 - EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); 5610 5608 5611 5609 void kvm_mmu_zap_all(struct kvm *kvm) 5612 5610 {

+6 -1

arch/x86/kvm/mmu/mmu_internal.h

··· 84 84 * When using the EPT page-modification log, the GPAs in the log 85 85 * would come from L2 rather than L1. Therefore, we need to rely 86 86 * on write protection to record dirty pages. This also bypasses 87 - * PML, since writes now result in a vmexit. 87 + * PML, since writes now result in a vmexit. Note, this helper will 88 + * tag SPTEs as needing write-protection even if PML is disabled or 89 + * unsupported, but that's ok because the tag is consumed if and only 90 + * if PML is enabled. Omit the PML check to save a few uops. 88 91 */ 89 92 return vcpu->arch.mmu == &vcpu->arch.guest_mmu; 90 93 } ··· 141 138 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) 142 139 #define SET_SPTE_SPURIOUS BIT(2) 143 140 141 + int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, 142 + gfn_t gfn, kvm_pfn_t pfn, int max_level); 144 143 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, 145 144 int max_level, kvm_pfn_t *pfnp, 146 145 bool huge_page_disallowed, int *req_level);

+11 -3

arch/x86/kvm/mmu/paging_tmpl.h

··· 601 601 if (sp->role.level > PG_LEVEL_4K) 602 602 return; 603 603 604 + /* 605 + * If addresses are being invalidated, skip prefetching to avoid 606 + * accidentally prefetching those addresses. 607 + */ 608 + if (unlikely(vcpu->kvm->mmu_notifier_count)) 609 + return; 610 + 604 611 if (sp->role.direct) 605 612 return __direct_pte_prefetch(vcpu, sp, sptep); 606 613 ··· 797 790 struct guest_walker walker; 798 791 int r; 799 792 kvm_pfn_t pfn; 793 + hva_t hva; 800 794 unsigned long mmu_seq; 801 795 bool map_writable, is_self_change_mapping; 802 796 int max_level; ··· 848 840 mmu_seq = vcpu->kvm->mmu_notifier_seq; 849 841 smp_rmb(); 850 842 851 - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, 852 - &map_writable)) 843 + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, 844 + write_fault, &map_writable)) 853 845 return RET_PF_RETRY; 854 846 855 847 if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) ··· 877 869 878 870 r = RET_PF_RETRY; 879 871 write_lock(&vcpu->kvm->mmu_lock); 880 - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 872 + if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) 881 873 goto out_unlock; 882 874 883 875 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);

+7 -59

arch/x86/kvm/mmu/tdp_mmu.c

··· 1269 1269 } 1270 1270 1271 1271 /* 1272 - * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is 1273 - * only used for PML, and so will involve setting the dirty bit on each SPTE. 1274 - * Returns true if an SPTE has been changed and the TLBs need to be flushed. 1275 - */ 1276 - static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1277 - gfn_t start, gfn_t end) 1278 - { 1279 - struct tdp_iter iter; 1280 - u64 new_spte; 1281 - bool spte_set = false; 1282 - 1283 - rcu_read_lock(); 1284 - 1285 - tdp_root_for_each_pte(iter, root, start, end) { 1286 - if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) 1287 - continue; 1288 - 1289 - if (!is_shadow_present_pte(iter.old_spte) || 1290 - iter.old_spte & shadow_dirty_mask) 1291 - continue; 1292 - 1293 - new_spte = iter.old_spte | shadow_dirty_mask; 1294 - 1295 - tdp_mmu_set_spte(kvm, &iter, new_spte); 1296 - spte_set = true; 1297 - } 1298 - 1299 - rcu_read_unlock(); 1300 - return spte_set; 1301 - } 1302 - 1303 - /* 1304 - * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is 1305 - * only used for PML, and so will involve setting the dirty bit on each SPTE. 1306 - * Returns true if an SPTE has been changed and the TLBs need to be flushed. 1307 - */ 1308 - bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) 1309 - { 1310 - struct kvm_mmu_page *root; 1311 - int root_as_id; 1312 - bool spte_set = false; 1313 - 1314 - for_each_tdp_mmu_root_yield_safe(kvm, root) { 1315 - root_as_id = kvm_mmu_page_as_id(root); 1316 - if (root_as_id != slot->as_id) 1317 - continue; 1318 - 1319 - spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, 1320 - slot->base_gfn + slot->npages); 1321 - } 1322 - return spte_set; 1323 - } 1324 - 1325 - /* 1326 1272 * Clear leaf entries which could be replaced by large mappings, for 1327 1273 * GFNs within the slot. 
1328 1274 */ 1329 1275 static void zap_collapsible_spte_range(struct kvm *kvm, 1330 1276 struct kvm_mmu_page *root, 1331 - gfn_t start, gfn_t end) 1277 + struct kvm_memory_slot *slot) 1332 1278 { 1279 + gfn_t start = slot->base_gfn; 1280 + gfn_t end = start + slot->npages; 1333 1281 struct tdp_iter iter; 1334 1282 kvm_pfn_t pfn; 1335 1283 bool spte_set = false; ··· 1296 1348 1297 1349 pfn = spte_to_pfn(iter.old_spte); 1298 1350 if (kvm_is_reserved_pfn(pfn) || 1299 - !PageTransCompoundMap(pfn_to_page(pfn))) 1351 + iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, 1352 + pfn, PG_LEVEL_NUM)) 1300 1353 continue; 1301 1354 1302 1355 tdp_mmu_set_spte(kvm, &iter, 0); ··· 1315 1366 * be replaced by large mappings, for GFNs within the slot. 1316 1367 */ 1317 1368 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 1318 - const struct kvm_memory_slot *slot) 1369 + struct kvm_memory_slot *slot) 1319 1370 { 1320 1371 struct kvm_mmu_page *root; 1321 1372 int root_as_id; ··· 1325 1376 if (root_as_id != slot->as_id) 1326 1377 continue; 1327 1378 1328 - zap_collapsible_spte_range(kvm, root, slot->base_gfn, 1329 - slot->base_gfn + slot->npages); 1379 + zap_collapsible_spte_range(kvm, root, slot); 1330 1380 } 1331 1381 } 1332 1382

+1 -2

arch/x86/kvm/mmu/tdp_mmu.h

··· 33 33 struct kvm_memory_slot *slot, 34 34 gfn_t gfn, unsigned long mask, 35 35 bool wrprot); 36 - bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); 37 36 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 38 - const struct kvm_memory_slot *slot); 37 + struct kvm_memory_slot *slot); 39 38 40 39 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 41 40 struct kvm_memory_slot *slot, gfn_t gfn);

+35 -13

arch/x86/kvm/svm/nested.c

··· 51 51 nested_svm_vmexit(svm); 52 52 } 53 53 54 + static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) 55 + { 56 + struct vcpu_svm *svm = to_svm(vcpu); 57 + WARN_ON(!is_guest_mode(vcpu)); 58 + 59 + if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && 60 + !svm->nested.nested_run_pending) { 61 + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; 62 + svm->vmcb->control.exit_code_hi = 0; 63 + svm->vmcb->control.exit_info_1 = fault->error_code; 64 + svm->vmcb->control.exit_info_2 = fault->address; 65 + nested_svm_vmexit(svm); 66 + } else { 67 + kvm_inject_page_fault(vcpu, fault); 68 + } 69 + } 70 + 54 71 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) 55 72 { 56 73 struct vcpu_svm *svm = to_svm(vcpu); ··· 453 436 { 454 437 int ret; 455 438 439 + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, 440 + vmcb12->save.rip, 441 + vmcb12->control.int_ctl, 442 + vmcb12->control.event_inj, 443 + vmcb12->control.nested_ctl); 444 + 445 + trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, 446 + vmcb12->control.intercepts[INTERCEPT_CR] >> 16, 447 + vmcb12->control.intercepts[INTERCEPT_EXCEPTION], 448 + vmcb12->control.intercepts[INTERCEPT_WORD3], 449 + vmcb12->control.intercepts[INTERCEPT_WORD4], 450 + vmcb12->control.intercepts[INTERCEPT_WORD5]); 451 + 452 + 456 453 svm->nested.vmcb12_gpa = vmcb12_gpa; 457 454 load_nested_vmcb_control(svm, &vmcb12->control); 458 - nested_prepare_vmcb_save(svm, vmcb12); 459 455 nested_prepare_vmcb_control(svm); 456 + nested_prepare_vmcb_save(svm, vmcb12); 460 457 461 458 ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, 462 459 nested_npt_enabled(svm)); 463 460 if (ret) 464 461 return ret; 462 + 463 + if (!npt_enabled) 464 + svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested; 465 465 466 466 svm_set_gif(svm, true); 467 467 ··· 523 489 goto out; 524 490 } 525 491 526 - 
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, 527 - vmcb12->save.rip, 528 - vmcb12->control.int_ctl, 529 - vmcb12->control.event_inj, 530 - vmcb12->control.nested_ctl); 531 - 532 - trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, 533 - vmcb12->control.intercepts[INTERCEPT_CR] >> 16, 534 - vmcb12->control.intercepts[INTERCEPT_EXCEPTION], 535 - vmcb12->control.intercepts[INTERCEPT_WORD3], 536 - vmcb12->control.intercepts[INTERCEPT_WORD4], 537 - vmcb12->control.intercepts[INTERCEPT_WORD5]); 538 492 539 493 /* Clear internal status */ 540 494 kvm_clear_exception_queue(&svm->vcpu);

+12 -10

arch/x86/kvm/svm/svm.c

··· 926 926 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || 927 927 boot_cpu_has(X86_FEATURE_AMD_SSBD)) 928 928 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); 929 - 930 - /* Enable INVPCID feature */ 931 - kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); 932 929 } 933 930 934 931 static __init int svm_hardware_setup(void) ··· 1100 1103 static void svm_check_invpcid(struct vcpu_svm *svm) 1101 1104 { 1102 1105 /* 1103 - * Intercept INVPCID instruction only if shadow page table is 1104 - * enabled. Interception is not required with nested page table 1105 - * enabled. 1106 + * Intercept INVPCID if shadow paging is enabled to sync/free shadow 1107 + * roots, or if INVPCID is disabled in the guest to inject #UD. 1106 1108 */ 1107 1109 if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { 1108 - if (!npt_enabled) 1110 + if (!npt_enabled || 1111 + !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID)) 1109 1112 svm_set_intercept(svm, INTERCEPT_INVPCID); 1110 1113 else 1111 1114 svm_clr_intercept(svm, INTERCEPT_INVPCID); ··· 2211 2214 [SVM_INSTR_VMSAVE] = vmsave_interception, 2212 2215 }; 2213 2216 struct vcpu_svm *svm = to_svm(vcpu); 2217 + int ret; 2214 2218 2215 2219 if (is_guest_mode(vcpu)) { 2216 2220 svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode]; 2217 2221 svm->vmcb->control.exit_info_1 = 0; 2218 2222 svm->vmcb->control.exit_info_2 = 0; 2219 2223 2220 - return nested_svm_vmexit(svm); 2221 - } else 2222 - return svm_instr_handlers[opcode](svm); 2224 + /* Returns '1' or -errno on failure, '0' on success. */ 2225 + ret = nested_svm_vmexit(svm); 2226 + if (ret) 2227 + return ret; 2228 + return 1; 2229 + } 2230 + return svm_instr_handlers[opcode](svm); 2223 2231 } 2224 2232 2225 2233 /*

+20 -17

arch/x86/kvm/vmx/nested.c

··· 2167 2167 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2168 2168 2169 2169 /* 2170 - * The PML address never changes, so it is constant in vmcs02. 2171 - * Conceptually we want to copy the PML index from vmcs01 here, 2172 - * and then back to vmcs01 on nested vmexit. But since we flush 2173 - * the log and reset GUEST_PML_INDEX on each vmexit, the PML 2174 - * index is also effectively constant in vmcs02. 2170 + * PML is emulated for L2, but never enabled in hardware as the MMU 2171 + * handles A/D emulation. Disabling PML for L2 also avoids having to 2172 + * deal with filtering out L2 GPAs from the buffer. 2175 2173 */ 2176 2174 if (enable_pml) { 2177 - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 2178 - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 2175 + vmcs_write64(PML_ADDRESS, 0); 2176 + vmcs_write16(GUEST_PML_INDEX, -1); 2179 2177 } 2180 2178 2181 2179 if (cpu_has_vmx_encls_vmexit()) ··· 2208 2210 2209 2211 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2210 2212 { 2211 - u32 exec_control, vmcs12_exec_ctrl; 2213 + u32 exec_control; 2212 2214 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2213 2215 2214 2216 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) ··· 2282 2284 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2283 2285 SECONDARY_EXEC_ENABLE_VMFUNC); 2284 2286 if (nested_cpu_has(vmcs12, 2285 - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { 2286 - vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & 2287 - ~SECONDARY_EXEC_ENABLE_PML; 2288 - exec_control |= vmcs12_exec_ctrl; 2289 - } 2287 + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2288 + exec_control |= vmcs12->secondary_vm_exec_control; 2289 + 2290 + /* PML is emulated and never enabled in hardware for L2. 
*/ 2291 + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2290 2292 2291 2293 /* VMCS shadowing for L2 is emulated for now */ 2292 2294 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; ··· 4198 4200 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) 4199 4201 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4200 4202 4201 - if (!enable_ept) 4202 - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 4203 - 4204 4203 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4205 4204 4206 4205 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); ··· 4488 4493 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4489 4494 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4490 4495 vmx_set_virtual_apic_mode(vcpu); 4496 + } 4497 + 4498 + if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4499 + vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4500 + vmx_update_cpu_dirty_logging(vcpu); 4491 4501 } 4492 4502 4493 4503 /* Unpin physical memory we referred to in vmcs02 */ ··· 5793 5793 case EXIT_REASON_PREEMPTION_TIMER: 5794 5794 return true; 5795 5795 case EXIT_REASON_PML_FULL: 5796 - /* We emulate PML support to L1. */ 5796 + /* 5797 + * PML is emulated for an L1 VMM and should never be enabled in 5798 + * vmcs02, always "handle" PML_FULL by exiting to userspace. 5799 + */ 5797 5800 return true; 5798 5801 case EXIT_REASON_VMFUNC: 5799 5802 /* VM functions are emulated through L2->L0 vmexits. */

+2 -2

arch/x86/kvm/vmx/pmu_intel.c

··· 298 298 if (IS_ERR(event)) { 299 299 pr_debug_ratelimited("%s: failed %ld\n", 300 300 __func__, PTR_ERR(event)); 301 - return -ENOENT; 301 + return PTR_ERR(event); 302 302 } 303 303 lbr_desc->event = event; 304 304 pmu->event_count++; ··· 320 320 if (!intel_pmu_is_valid_lbr_msr(vcpu, index)) 321 321 return false; 322 322 323 - if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu)) 323 + if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0) 324 324 goto dummy; 325 325 326 326 /*

+40 -72

arch/x86/kvm/vmx/vmx.c

··· 4277 4277 */ 4278 4278 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4279 4279 4280 - if (!enable_pml) 4280 + /* 4281 + * PML is enabled/disabled when dirty logging of memslots changes, but 4282 + * it needs to be set here when dirty logging is already active, e.g. 4283 + * if this vCPU was created after dirty logging was enabled. 4284 + */ 4285 + if (!vcpu->kvm->arch.cpu_dirty_logging_count) 4281 4286 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4282 4287 4283 4288 if (cpu_has_vmx_xsaves()) { ··· 4300 4295 } 4301 4296 4302 4297 vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); 4303 - 4304 - /* 4305 - * Expose INVPCID if and only if PCID is also exposed to the guest. 4306 - * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF 4307 - * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect 4308 - * behavior from the guest perspective (it would expect #GP or #PF). 4309 - */ 4310 - if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) 4311 - guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); 4312 4298 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4313 - 4314 4299 4315 4300 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4316 4301 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); ··· 5771 5776 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 5772 5777 } 5773 5778 5774 - /* 5775 - * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. 5776 - * Called before reporting dirty_bitmap to userspace. 5777 - */ 5778 - static void kvm_flush_pml_buffers(struct kvm *kvm) 5779 - { 5780 - int i; 5781 - struct kvm_vcpu *vcpu; 5782 - /* 5783 - * We only need to kick vcpu out of guest mode here, as PML buffer 5784 - * is flushed at beginning of all VMEXITs, and it's obvious that only 5785 - * vcpus running in guest are possible to have unflushed GPAs in PML 5786 - * buffer. 
5787 - */ 5788 - kvm_for_each_vcpu(i, vcpu, kvm) 5789 - kvm_vcpu_kick(vcpu); 5790 - } 5791 - 5792 5779 static void vmx_dump_sel(char *name, uint32_t sel) 5793 5780 { 5794 5781 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", ··· 5953 5976 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before 5954 5977 * querying dirty_bitmap, we only need to kick all vcpus out of guest 5955 5978 * mode as if vcpus is in root mode, the PML buffer must has been 5956 - * flushed already. 5979 + * flushed already. Note, PML is never enabled in hardware while 5980 + * running L2. 5957 5981 */ 5958 - if (enable_pml) 5982 + if (enable_pml && !is_guest_mode(vcpu)) 5959 5983 vmx_flush_pml_buffer(vcpu); 5960 5984 5961 5985 /* ··· 5972 5994 return handle_invalid_guest_state(vcpu); 5973 5995 5974 5996 if (is_guest_mode(vcpu)) { 5997 + /* 5998 + * PML is never enabled when running L2, bail immediately if a 5999 + * PML full exit occurs as something is horribly wrong. 6000 + */ 6001 + if (exit_reason.basic == EXIT_REASON_PML_FULL) 6002 + goto unexpected_vmexit; 6003 + 5975 6004 /* 5976 6005 * The host physical addresses of some pages of guest memory 5977 6006 * are loaded into the vmcs02 (e.g. 
vmcs12's Virtual APIC ··· 6836 6851 if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) 6837 6852 kvm_machine_check(); 6838 6853 6854 + if (likely(!vmx->exit_reason.failed_vmentry)) 6855 + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 6856 + 6839 6857 trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX); 6840 6858 6841 6859 if (unlikely(vmx->exit_reason.failed_vmentry)) 6842 6860 return EXIT_FASTPATH_NONE; 6843 6861 6844 6862 vmx->loaded_vmcs->launched = 1; 6845 - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 6846 6863 6847 6864 vmx_recover_nmi_blocking(vmx); 6848 6865 vmx_complete_interrupts(vmx); ··· 7317 7330 /* CPUID 0x7 */ 7318 7331 if (kvm_mpx_supported()) 7319 7332 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 7320 - if (cpu_has_vmx_invpcid()) 7321 - kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); 7333 + if (!cpu_has_vmx_invpcid()) 7334 + kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 7322 7335 if (vmx_pt_mode_is_host_guest()) 7323 7336 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 7324 7337 ··· 7496 7509 shrink_ple_window(vcpu); 7497 7510 } 7498 7511 7499 - static void vmx_slot_enable_log_dirty(struct kvm *kvm, 7500 - struct kvm_memory_slot *slot) 7512 + void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) 7501 7513 { 7502 - if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) 7503 - kvm_mmu_slot_leaf_clear_dirty(kvm, slot); 7504 - kvm_mmu_slot_largepage_remove_write_access(kvm, slot); 7505 - } 7514 + struct vcpu_vmx *vmx = to_vmx(vcpu); 7506 7515 7507 - static void vmx_slot_disable_log_dirty(struct kvm *kvm, 7508 - struct kvm_memory_slot *slot) 7509 - { 7510 - kvm_mmu_slot_set_dirty(kvm, slot); 7511 - } 7516 + if (is_guest_mode(vcpu)) { 7517 + vmx->nested.update_vmcs01_cpu_dirty_logging = true; 7518 + return; 7519 + } 7512 7520 7513 - static void vmx_flush_log_dirty(struct kvm *kvm) 7514 - { 7515 - kvm_flush_pml_buffers(kvm); 7516 - } 7517 - 7518 - static void 
vmx_enable_log_dirty_pt_masked(struct kvm *kvm, 7519 - struct kvm_memory_slot *memslot, 7520 - gfn_t offset, unsigned long mask) 7521 - { 7522 - kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); 7521 + /* 7522 + * Note, cpu_dirty_logging_count can be changed concurrent with this 7523 + * code, but in that case another update request will be made and so 7524 + * the guest will never run with a stale PML value. 7525 + */ 7526 + if (vcpu->kvm->arch.cpu_dirty_logging_count) 7527 + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 7528 + else 7529 + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 7523 7530 } 7524 7531 7525 7532 static int vmx_pre_block(struct kvm_vcpu *vcpu) ··· 7623 7642 return supported & BIT(bit); 7624 7643 } 7625 7644 7626 - static int vmx_cpu_dirty_log_size(void) 7627 - { 7628 - return enable_pml ? PML_ENTITY_NUM : 0; 7629 - } 7630 - 7631 7645 static struct kvm_x86_ops vmx_x86_ops __initdata = { 7632 7646 .hardware_unsetup = hardware_unsetup, 7633 7647 ··· 7722 7746 7723 7747 .sched_in = vmx_sched_in, 7724 7748 7725 - .slot_enable_log_dirty = vmx_slot_enable_log_dirty, 7726 - .slot_disable_log_dirty = vmx_slot_disable_log_dirty, 7727 - .flush_log_dirty = vmx_flush_log_dirty, 7728 - .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, 7749 + .cpu_dirty_log_size = PML_ENTITY_NUM, 7750 + .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, 7729 7751 7730 7752 .pre_block = vmx_pre_block, 7731 7753 .post_block = vmx_post_block, ··· 7751 7777 7752 7778 .msr_filter_changed = vmx_msr_filter_changed, 7753 7779 .complete_emulated_msr = kvm_complete_insn_gp, 7754 - .cpu_dirty_log_size = vmx_cpu_dirty_log_size, 7755 7780 7756 7781 .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, 7757 7782 }; ··· 7867 7894 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 7868 7895 enable_pml = 0; 7869 7896 7870 - if (!enable_pml) { 7871 - vmx_x86_ops.slot_enable_log_dirty = NULL; 7872 - 
vmx_x86_ops.slot_disable_log_dirty = NULL; 7873 - vmx_x86_ops.flush_log_dirty = NULL; 7874 - vmx_x86_ops.enable_log_dirty_pt_masked = NULL; 7875 - vmx_x86_ops.cpu_dirty_log_size = NULL; 7876 - } 7897 + if (!enable_pml) 7898 + vmx_x86_ops.cpu_dirty_log_size = 0; 7877 7899 7878 7900 if (!cpu_has_vmx_preemption_timer()) 7879 7901 enable_preemption_timer = false;

+2

arch/x86/kvm/vmx/vmx.h

··· 165 165 166 166 bool change_vmcs01_virtual_apic_mode; 167 167 bool reload_vmcs01_apic_access_page; 168 + bool update_vmcs01_cpu_dirty_logging; 168 169 169 170 /* 170 171 * Enlightened VMCS has been enabled. It does not mean that L1 has to ··· 394 393 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); 395 394 void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, 396 395 u32 msr, int type, bool value); 396 + void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); 397 397 398 398 static inline u8 vmx_get_rvi(void) 399 399 {

+90 -59

arch/x86/kvm/x86.c

··· 5215 5215 5216 5216 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) 5217 5217 { 5218 + 5218 5219 /* 5219 - * Flush potentially hardware-cached dirty pages to dirty_bitmap. 5220 + * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called 5221 + * before reporting dirty_bitmap to userspace. KVM flushes the buffers 5222 + * on all VM-Exits, thus we only need to kick running vCPUs to force a 5223 + * VM-Exit. 5220 5224 */ 5221 - static_call_cond(kvm_x86_flush_log_dirty)(kvm); 5225 + struct kvm_vcpu *vcpu; 5226 + int i; 5227 + 5228 + kvm_for_each_vcpu(i, vcpu, kvm) 5229 + kvm_vcpu_kick(vcpu); 5222 5230 } 5223 5231 5224 5232 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, ··· 8988 8980 kvm_check_async_pf_completion(vcpu); 8989 8981 if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) 8990 8982 static_call(kvm_x86_msr_filter_changed)(vcpu); 8983 + 8984 + if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) 8985 + static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); 8991 8986 } 8992 8987 8993 8988 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || ··· 10759 10748 return 0; 10760 10749 } 10761 10750 10751 + 10752 + static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) 10753 + { 10754 + struct kvm_arch *ka = &kvm->arch; 10755 + 10756 + if (!kvm_x86_ops.cpu_dirty_log_size) 10757 + return; 10758 + 10759 + if ((enable && ++ka->cpu_dirty_logging_count == 1) || 10760 + (!enable && --ka->cpu_dirty_logging_count == 0)) 10761 + kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING); 10762 + 10763 + WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0); 10764 + } 10765 + 10762 10766 static void kvm_mmu_slot_apply_flags(struct kvm *kvm, 10763 10767 struct kvm_memory_slot *old, 10764 10768 struct kvm_memory_slot *new, 10765 10769 enum kvm_mr_change change) 10766 10770 { 10771 + bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; 10772 + 10767 10773 /* 10768 - * 
Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot. 10769 - * See comments below. 10774 + * Update CPU dirty logging if dirty logging is being toggled. This 10775 + * applies to all operations. 10776 + */ 10777 + if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES) 10778 + kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); 10779 + 10780 + /* 10781 + * Nothing more to do for RO slots (which can't be dirtied and can't be 10782 + * made writable) or CREATE/MOVE/DELETE of a slot. 10783 + * 10784 + * For a memslot with dirty logging disabled: 10785 + * CREATE: No dirty mappings will already exist. 10786 + * MOVE/DELETE: The old mappings will already have been cleaned up by 10787 + * kvm_arch_flush_shadow_memslot() 10788 + * 10789 + * For a memslot with dirty logging enabled: 10790 + * CREATE: No shadow pages exist, thus nothing to write-protect 10791 + * and no dirty bits to clear. 10792 + * MOVE/DELETE: The old mappings will already have been cleaned up by 10793 + * kvm_arch_flush_shadow_memslot(). 10770 10794 */ 10771 10795 if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) 10772 10796 return; 10773 10797 10774 10798 /* 10775 - * Dirty logging tracks sptes in 4k granularity, meaning that large 10776 - * sptes have to be split. If live migration is successful, the guest 10777 - * in the source machine will be destroyed and large sptes will be 10778 - * created in the destination. However, if the guest continues to run 10779 - * in the source machine (for example if live migration fails), small 10780 - * sptes will remain around and cause bad performance. 10781 - * 10782 - * Scan sptes if dirty logging has been stopped, dropping those 10783 - * which can be collapsed into a single large-page spte. Later 10784 - * page faults will create the large-page sptes. 10785 - * 10786 - * There is no need to do this in any of the following cases: 10787 - * CREATE: No dirty mappings will already exist. 
10788 - * MOVE/DELETE: The old mappings will already have been cleaned up by 10789 - * kvm_arch_flush_shadow_memslot() 10799 + * READONLY and non-flags changes were filtered out above, and the only 10800 + * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty 10801 + * logging isn't being toggled on or off. 10790 10802 */ 10791 - if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) && 10792 - !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) 10803 + if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES))) 10804 + return; 10805 + 10806 + if (!log_dirty_pages) { 10807 + /* 10808 + * Dirty logging tracks sptes in 4k granularity, meaning that 10809 + * large sptes have to be split. If live migration succeeds, 10810 + * the guest in the source machine will be destroyed and large 10811 + * sptes will be created in the destination. However, if the 10812 + * guest continues to run in the source machine (for example if 10813 + * live migration fails), small sptes will remain around and 10814 + * cause bad performance. 10815 + * 10816 + * Scan sptes if dirty logging has been stopped, dropping those 10817 + * which can be collapsed into a single large-page spte. Later 10818 + * page faults will create the large-page sptes. 10819 + */ 10793 10820 kvm_mmu_zap_collapsible_sptes(kvm, new); 10821 + } else { 10822 + /* By default, write-protect everything to log writes. */ 10823 + int level = PG_LEVEL_4K; 10794 10824 10795 - /* 10796 - * Enable or disable dirty logging for the slot. 10797 - * 10798 - * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old 10799 - * slot have been zapped so no dirty logging updates are needed for 10800 - * the old slot. 10801 - * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible 10802 - * any mappings that might be created in it will consume the 10803 - * properties of the new slot and do not need to be updated here. 
10804 - * 10805 - * When PML is enabled, the kvm_x86_ops dirty logging hooks are 10806 - * called to enable/disable dirty logging. 10807 - * 10808 - * When disabling dirty logging with PML enabled, the D-bit is set 10809 - * for sptes in the slot in order to prevent unnecessary GPA 10810 - * logging in the PML buffer (and potential PML buffer full VMEXIT). 10811 - * This guarantees leaving PML enabled for the guest's lifetime 10812 - * won't have any additional overhead from PML when the guest is 10813 - * running with dirty logging disabled. 10814 - * 10815 - * When enabling dirty logging, large sptes are write-protected 10816 - * so they can be split on first write. New large sptes cannot 10817 - * be created for this slot until the end of the logging. 10818 - * See the comments in fast_page_fault(). 10819 - * For small sptes, nothing is done if the dirty log is in the 10820 - * initial-all-set state. Otherwise, depending on whether pml 10821 - * is enabled the D-bit or the W-bit will be cleared. 10822 - */ 10823 - if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { 10824 - if (kvm_x86_ops.slot_enable_log_dirty) { 10825 - static_call(kvm_x86_slot_enable_log_dirty)(kvm, new); 10826 - } else { 10827 - int level = 10828 - kvm_dirty_log_manual_protect_and_init_set(kvm) ? 10829 - PG_LEVEL_2M : PG_LEVEL_4K; 10825 + if (kvm_x86_ops.cpu_dirty_log_size) { 10826 + /* 10827 + * Clear all dirty bits, unless pages are treated as 10828 + * dirty from the get-go. 10829 + */ 10830 + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) 10831 + kvm_mmu_slot_leaf_clear_dirty(kvm, new); 10830 10832 10833 + /* 10834 + * Write-protect large pages on write so that dirty 10835 + * logging happens at 4k granularity. No need to 10836 + * write-protect small SPTEs since write accesses are 10837 + * logged by the CPU via dirty bits. 
10838 + */ 10839 + level = PG_LEVEL_2M; 10840 + } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { 10831 10841 /* 10832 10842 * If we're with initial-all-set, we don't need 10833 10843 * to write protect any small page because ··· 10857 10825 * so that the page split can happen lazily on 10858 10826 * the first write to the huge page. 10859 10827 */ 10860 - kvm_mmu_slot_remove_write_access(kvm, new, level); 10828 + level = PG_LEVEL_2M; 10861 10829 } 10862 - } else { 10863 - static_call_cond(kvm_x86_slot_disable_log_dirty)(kvm, new); 10830 + kvm_mmu_slot_remove_write_access(kvm, new, level); 10864 10831 } 10865 10832 } 10866 10833

+24 -1

include/linux/kvm_host.h

··· 11 11 #include <linux/signal.h> 12 12 #include <linux/sched.h> 13 13 #include <linux/bug.h> 14 + #include <linux/minmax.h> 14 15 #include <linux/mm.h> 15 16 #include <linux/mmu_notifier.h> 16 17 #include <linux/preempt.h> ··· 507 506 struct mmu_notifier mmu_notifier; 508 507 unsigned long mmu_notifier_seq; 509 508 long mmu_notifier_count; 509 + unsigned long mmu_notifier_range_start; 510 + unsigned long mmu_notifier_range_end; 510 511 #endif 511 512 long tlbs_dirty; 512 513 struct list_head devices; ··· 736 733 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); 737 734 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, 738 735 bool atomic, bool *async, bool write_fault, 739 - bool *writable); 736 + bool *writable, hva_t *hva); 740 737 741 738 void kvm_release_pfn_clean(kvm_pfn_t pfn); 742 739 void kvm_release_pfn_dirty(kvm_pfn_t pfn); ··· 1206 1203 * can't rely on kvm->mmu_lock to keep things ordered. 1207 1204 */ 1208 1205 smp_rmb(); 1206 + if (kvm->mmu_notifier_seq != mmu_seq) 1207 + return 1; 1208 + return 0; 1209 + } 1210 + 1211 + static inline int mmu_notifier_retry_hva(struct kvm *kvm, 1212 + unsigned long mmu_seq, 1213 + unsigned long hva) 1214 + { 1215 + lockdep_assert_held(&kvm->mmu_lock); 1216 + /* 1217 + * If mmu_notifier_count is non-zero, then the range maintained by 1218 + * kvm_mmu_notifier_invalidate_range_start contains all addresses that 1219 + * might be being invalidated. Note that it may include some false 1220 + * positives, due to shortcuts when handling concurrent invalidations. 1221 + */ 1222 + if (unlikely(kvm->mmu_notifier_count) && 1223 + hva >= kvm->mmu_notifier_range_start && 1224 + hva < kvm->mmu_notifier_range_end) 1225 + return 1; 1209 1226 if (kvm->mmu_notifier_seq != mmu_seq) 1210 1227 return 1; 1211 1228 return 0;

+1

tools/testing/selftests/kvm/.gitignore

··· 33 33 /demand_paging_test 34 34 /dirty_log_test 35 35 /dirty_log_perf_test 36 + /hardware_disable_test 36 37 /kvm_create_max_vcpus 37 38 /memslot_modification_stress_test 38 39 /set_memory_region_test

+1

tools/testing/selftests/kvm/Makefile

··· 67 67 TEST_GEN_PROGS_x86_64 += demand_paging_test 68 68 TEST_GEN_PROGS_x86_64 += dirty_log_test 69 69 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test 70 + TEST_GEN_PROGS_x86_64 += hardware_disable_test 70 71 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus 71 72 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test 72 73 TEST_GEN_PROGS_x86_64 += set_memory_region_test

+165

tools/testing/selftests/kvm/hardware_disable_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * This test is intended to reproduce a crash that happens when 4 + * kvm_arch_hardware_disable is called and it attempts to unregister the user 5 + * return notifiers. 6 + */ 7 + 8 + #define _GNU_SOURCE 9 + 10 + #include <fcntl.h> 11 + #include <pthread.h> 12 + #include <semaphore.h> 13 + #include <stdint.h> 14 + #include <stdlib.h> 15 + #include <unistd.h> 16 + #include <sys/wait.h> 17 + 18 + #include <test_util.h> 19 + 20 + #include "kvm_util.h" 21 + 22 + #define VCPU_NUM 4 23 + #define SLEEPING_THREAD_NUM (1 << 4) 24 + #define FORK_NUM (1ULL << 9) 25 + #define DELAY_US_MAX 2000 26 + #define GUEST_CODE_PIO_PORT 4 27 + 28 + sem_t *sem; 29 + 30 + /* Arguments for the pthreads */ 31 + struct payload { 32 + struct kvm_vm *vm; 33 + uint32_t index; 34 + }; 35 + 36 + static void guest_code(void) 37 + { 38 + for (;;) 39 + ; /* Some busy work */ 40 + printf("Should not be reached.\n"); 41 + } 42 + 43 + static void *run_vcpu(void *arg) 44 + { 45 + struct payload *payload = (struct payload *)arg; 46 + struct kvm_run *state = vcpu_state(payload->vm, payload->index); 47 + 48 + vcpu_run(payload->vm, payload->index); 49 + 50 + TEST_ASSERT(false, "%s: exited with reason %d: %s\n", 51 + __func__, state->exit_reason, 52 + exit_reason_str(state->exit_reason)); 53 + pthread_exit(NULL); 54 + } 55 + 56 + static void *sleeping_thread(void *arg) 57 + { 58 + int fd; 59 + 60 + while (true) { 61 + fd = open("/dev/null", O_RDWR); 62 + close(fd); 63 + } 64 + TEST_ASSERT(false, "%s: exited\n", __func__); 65 + pthread_exit(NULL); 66 + } 67 + 68 + static inline void check_create_thread(pthread_t *thread, pthread_attr_t *attr, 69 + void *(*f)(void *), void *arg) 70 + { 71 + int r; 72 + 73 + r = pthread_create(thread, attr, f, arg); 74 + TEST_ASSERT(r == 0, "%s: failed to create thread", __func__); 75 + } 76 + 77 + static inline void check_set_affinity(pthread_t thread, cpu_set_t *cpu_set) 78 + { 79 + int r; 80 + 81 + r = 
pthread_setaffinity_np(thread, sizeof(cpu_set_t), cpu_set); 82 + TEST_ASSERT(r == 0, "%s: failed set affinity", __func__); 83 + } 84 + 85 + static inline void check_join(pthread_t thread, void **retval) 86 + { 87 + int r; 88 + 89 + r = pthread_join(thread, retval); 90 + TEST_ASSERT(r == 0, "%s: failed to join thread", __func__); 91 + } 92 + 93 + static void run_test(uint32_t run) 94 + { 95 + struct kvm_vm *vm; 96 + cpu_set_t cpu_set; 97 + pthread_t threads[VCPU_NUM]; 98 + pthread_t throw_away; 99 + struct payload payloads[VCPU_NUM]; 100 + void *b; 101 + uint32_t i, j; 102 + 103 + CPU_ZERO(&cpu_set); 104 + for (i = 0; i < VCPU_NUM; i++) 105 + CPU_SET(i, &cpu_set); 106 + 107 + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); 108 + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); 109 + vm_create_irqchip(vm); 110 + 111 + fprintf(stderr, "%s: [%d] start vcpus\n", __func__, run); 112 + for (i = 0; i < VCPU_NUM; ++i) { 113 + vm_vcpu_add_default(vm, i, guest_code); 114 + payloads[i].vm = vm; 115 + payloads[i].index = i; 116 + 117 + check_create_thread(&threads[i], NULL, run_vcpu, 118 + (void *)&payloads[i]); 119 + check_set_affinity(threads[i], &cpu_set); 120 + 121 + for (j = 0; j < SLEEPING_THREAD_NUM; ++j) { 122 + check_create_thread(&throw_away, NULL, sleeping_thread, 123 + (void *)NULL); 124 + check_set_affinity(throw_away, &cpu_set); 125 + } 126 + } 127 + fprintf(stderr, "%s: [%d] all threads launched\n", __func__, run); 128 + sem_post(sem); 129 + for (i = 0; i < VCPU_NUM; ++i) 130 + check_join(threads[i], &b); 131 + /* Should not be reached */ 132 + TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run); 133 + } 134 + 135 + int main(int argc, char **argv) 136 + { 137 + uint32_t i; 138 + int s, r; 139 + pid_t pid; 140 + 141 + sem = sem_open("vm_sem", O_CREAT | O_EXCL, 0644, 0); 142 + sem_unlink("vm_sem"); 143 + 144 + for (i = 0; i < FORK_NUM; ++i) { 145 + pid = fork(); 146 + TEST_ASSERT(pid >= 0, "%s: unable to fork", __func__); 
147 + if (pid == 0) 148 + run_test(i); /* This function always exits */ 149 + 150 + fprintf(stderr, "%s: [%d] waiting semaphore\n", __func__, i); 151 + sem_wait(sem); 152 + r = (rand() % DELAY_US_MAX) + 1; 153 + fprintf(stderr, "%s: [%d] waiting %dus\n", __func__, i, r); 154 + usleep(r); 155 + r = waitpid(pid, &s, WNOHANG); 156 + TEST_ASSERT(r != pid, 157 + "%s: [%d] child exited unexpectedly status: [%d]", 158 + __func__, i, s); 159 + fprintf(stderr, "%s: [%d] killing child\n", __func__, i); 160 + kill(pid, SIGKILL); 161 + } 162 + 163 + sem_destroy(sem); 164 + exit(0); 165 + }

+2 -1

tools/testing/selftests/kvm/lib/x86_64/processor.c

··· 720 720 { 721 721 struct vcpu *vcpu = vcpu_find(vm, vcpuid); 722 722 struct kvm_cpuid2 *cpuid; 723 - int rc, max_ent; 723 + int max_ent; 724 + int rc = -1; 724 725 725 726 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); 726 727

+25 -4

virt/kvm/kvm_main.c

··· 486 486 * count is also read inside the mmu_lock critical section. 487 487 */ 488 488 kvm->mmu_notifier_count++; 489 + if (likely(kvm->mmu_notifier_count == 1)) { 490 + kvm->mmu_notifier_range_start = range->start; 491 + kvm->mmu_notifier_range_end = range->end; 492 + } else { 493 + /* 494 + * Fully tracking multiple concurrent ranges has diminishing 495 + * returns. Keep things simple and just find the minimal range 496 + * which includes the current and new ranges. As there won't be 497 + * enough information to subtract a range after its invalidate 498 + * completes, any ranges invalidated concurrently will 499 + * accumulate and persist until all outstanding invalidates 500 + * complete. 501 + */ 502 + kvm->mmu_notifier_range_start = 503 + min(kvm->mmu_notifier_range_start, range->start); 504 + kvm->mmu_notifier_range_end = 505 + max(kvm->mmu_notifier_range_end, range->end); 506 + } 489 507 need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, 490 508 range->flags); 491 509 /* we've to flush the tlb before the pages can be freed */ ··· 2041 2023 2042 2024 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, 2043 2025 bool atomic, bool *async, bool write_fault, 2044 - bool *writable) 2026 + bool *writable, hva_t *hva) 2045 2027 { 2046 2028 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 2029 + 2030 + if (hva) 2031 + *hva = addr; 2047 2032 2048 2033 if (addr == KVM_HVA_ERR_RO_BAD) { 2049 2034 if (writable) ··· 2075 2054 bool *writable) 2076 2055 { 2077 2056 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 2078 - write_fault, writable); 2057 + write_fault, writable, NULL); 2079 2058 } 2080 2059 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 2081 2060 2082 2061 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 2083 2062 { 2084 - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 2063 + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); 2085 
2064 } 2086 2065 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 2087 2066 2088 2067 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 2089 2068 { 2090 - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 2069 + return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL); 2091 2070 } 2092 2071 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 2093 2072