Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"The main change here is a revert of reverts. We recently simplified
some code that was thought unnecessary; however, since then KVM has
grown quite a few cond_resched()s and for that reason the simplified
code is prone to livelocks: one CPU tries to empty a list of guest
page tables while the others keep adding to them. This adds back the
generation-based zapping of guest page tables, which was not
unnecessary after all.

On top of this, there is a fix for a leak of uninitialized kernel stack
memory and a couple
of s390 fixlets as well"
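
For readers who want the shape of the fix without wading through the diff
below, here is a minimal, single-threaded userspace sketch of the
generation-based zapping idea; the names are invented for illustration, and
the real code in arch/x86/kvm/mmu.c additionally breaks the mmu_lock with
cond_resched_lock() while it walks the list.

#include <stdio.h>
#include <stdlib.h>

struct shadow_page {
	unsigned long valid_gen;		/* generation at creation time */
	struct shadow_page *prev, *next;	/* FIFO list, newest at the head */
};

static struct shadow_page head = { .prev = &head, .next = &head };
static unsigned long mmu_valid_gen;

/* New pages are stamped with the current generation and added at the head. */
static void alloc_shadow_page(void)
{
	struct shadow_page *sp = calloc(1, sizeof(*sp));

	sp->valid_gen = mmu_valid_gen;
	sp->next = head.next;
	sp->prev = &head;
	head.next->prev = sp;
	head.next = sp;
}

/*
 * Walk from the tail (oldest first) and free pages whose generation is
 * stale.  Because the list is FIFO-ordered, the walk can stop at the first
 * current-generation page: everything newer is valid too, so pages that
 * other vCPUs keep adding cannot prolong the walk indefinitely.
 */
static void zap_obsolete_pages(void)
{
	struct shadow_page *sp, *prev;

	for (sp = head.prev; sp != &head; sp = prev) {
		prev = sp->prev;
		if (sp->valid_gen == mmu_valid_gen)
			break;
		sp->prev->next = sp->next;	/* unlink the obsolete page */
		sp->next->prev = sp->prev;
		free(sp);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		alloc_shadow_page();	/* old generation */

	mmu_valid_gen++;		/* invalidate everything created so far */
	alloc_shadow_page();		/* pages added meanwhile are already valid */
	alloc_shadow_page();

	zap_obsolete_pages();		/* frees the three stale pages */

	int n = 0;
	for (struct shadow_page *sp = head.next; sp != &head; sp = sp->next)
		n++;
	printf("%d shadow page(s) remain\n", n);	/* prints 2 */
	return 0;
}

The point of the early break is exactly the livelock avoidance described
above: the zapping CPU never has to chase pages that other CPUs add after
the generation bump, because those pages are born valid.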

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: x86/mmu: Reintroduce fast invalidate/zap for flushing memslot
KVM: x86: work around leak of uninitialized stack contents
KVM: nVMX: handle page fault in vmread
KVM: s390: Do not leak kernel stack data in the KVM_S390_INTERRUPT ioctl
KVM: s390: kvm_s390_vm_start_migration: check dirty_bitmap before using it as target for memset()

Changed files: +124 -4

arch/s390/kvm/interrupt.c | +10
···
 	case KVM_S390_MCHK:
 		irq->u.mchk.mcic = s390int->parm64;
 		break;
+	case KVM_S390_INT_PFAULT_INIT:
+		irq->u.ext.ext_params = s390int->parm;
+		irq->u.ext.ext_params2 = s390int->parm64;
+		break;
+	case KVM_S390_RESTART:
+	case KVM_S390_INT_CLOCK_COMP:
+	case KVM_S390_INT_CPU_TIMER:
+		break;
+	default:
+		return -EINVAL;
 	}
 	return 0;
 }

arch/s390/kvm/kvm-s390.c | +3 -1
···
 	/* mark all the pages in active slots as dirty */
 	for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
 		ms = slots->memslots + slotnr;
+		if (!ms->dirty_bitmap)
+			return -EINVAL;
 		/*
 		 * The second half of the bitmap is only used on x86,
 		 * and would be wasted otherwise, so we put it to good
···
 	}
 	case KVM_S390_INTERRUPT: {
 		struct kvm_s390_interrupt s390int;
-		struct kvm_s390_irq s390irq;
+		struct kvm_s390_irq s390irq = {};

 		if (copy_from_user(&s390int, argp, sizeof(s390int)))
 			return -EFAULT;

arch/x86/include/asm/kvm_host.h | +2
···
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+	unsigned long mmu_valid_gen;
 	DECLARE_BITMAP(unsync_child_bitmap, 512);

 #ifdef CONFIG_X86_32
···
 	unsigned long n_requested_mmu_pages;
 	unsigned long n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
+	unsigned long mmu_valid_gen;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.

arch/x86/kvm/mmu.c | +99 -2
···
 	if (!direct)
 		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+	/*
+	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+	 * depends on valid pages being added to the head of the list. See
+	 * comments in kvm_zap_obsolete_pages().
+	 */
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
 	return sp;
···
 #define for_each_valid_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-		if ((_sp)->role.invalid) {				\
+		if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
 		} else

 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)		\
···
 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
 static void mmu_audit_disable(void) { }
 #endif
+
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}

 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			  struct list_head *invalid_list)
···
 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
 			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
 	}
+	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	clear_page(sp->spt);
 	trace_kvm_mmu_get_page(sp, true);
···
 		return false;

 	if (cached_root_available(vcpu, new_cr3, new_role)) {
+		/*
+		 * It is possible that the cached previous root page is
+		 * obsolete because of a change in the MMU generation
+		 * number. However, changing the generation number is
+		 * accompanied by KVM_REQ_MMU_RELOAD, which will free
+		 * the root set here and allocate a new one.
+		 */
 		kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
 		if (!skip_tlb_flush) {
 			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
···
 	return alloc_mmu_pages(vcpu);
 }

+
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
+	int ign;
+
+restart:
+	list_for_each_entry_safe_reverse(sp, node,
+	      &kvm->arch.active_mmu_pages, link) {
+		/*
+		 * No obsolete valid page exists before a newly created page
+		 * since active_mmu_pages is a FIFO list.
+		 */
+		if (!is_obsolete_sp(kvm, sp))
+			break;
+
+		/*
+		 * Do not repeatedly zap a root page to avoid unnecessary
+		 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
+		 * progress:
+		 *    vcpu 0                        vcpu 1
+		 *                         call vcpu_enter_guest():
+		 *                            1): handle KVM_REQ_MMU_RELOAD
+		 *                                and require mmu-lock to
+		 *                                load mmu
+		 * repeat:
+		 *    1): zap root page and
+		 *        send KVM_REQ_MMU_RELOAD
+		 *
+		 *    2): if (cond_resched_lock(mmu-lock))
+		 *
+		 *                            2): hold mmu-lock and load mmu
+		 *
+		 *                            3): see KVM_REQ_MMU_RELOAD bit
+		 *                                on vcpu->requests is set
+		 *                                then return 1 to call
+		 *                                vcpu_enter_guest() again.
+		 *            goto repeat;
+		 *
+		 * Since we are reversely walking the list and the invalid
+		 * list will be moved to the head, skip the invalid page
+		 * can help us to avoid the infinity list walking.
+		 */
+		if (sp->role.invalid)
+			continue;
+
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+			kvm_mmu_commit_zap_page(kvm, &invalid_list);
+			cond_resched_lock(&kvm->mmu_lock);
+			goto restart;
+		}
+
+		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+			goto restart;
+	}
+
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+}
+
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+static void kvm_mmu_zap_all_fast(struct kvm *kvm)
+{
+	spin_lock(&kvm->mmu_lock);
+	kvm->arch.mmu_valid_gen++;
+
+	kvm_zap_obsolete_pages(kvm);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 			struct kvm_memory_slot *slot,
 			struct kvm_page_track_notifier_node *node)
 {
-	kvm_mmu_zap_all(kvm);
+	kvm_mmu_zap_all_fast(kvm);
 }

 void kvm_mmu_init_vm(struct kvm *kvm)

arch/x86/kvm/vmx/nested.c | +3 -1
···
 	int len;
 	gva_t gva = 0;
 	struct vmcs12 *vmcs12;
+	struct x86_exception e;
 	short offset;

 	if (!nested_vmx_check_permission(vcpu))
···
 				vmx_instruction_info, true, len, &gva))
 			return 1;
 		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
-		kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL);
+		if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
+			kvm_inject_page_fault(vcpu, &e);
 	}

 	return nested_vmx_succeed(vcpu);

arch/x86/kvm/x86.c | +7
···
 	/* kvm_write_guest_virt_system can pull in tons of pages. */
 	vcpu->arch.l1tf_flush_l1d = true;

+	/*
+	 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+	 * is returned, but our callers are not ready for that and they blindly
+	 * call kvm_inject_page_fault. Ensure that they at least do not leak
+	 * uninitialized kernel stack memory into cr2 and error code.
+	 */
+	memset(exception, 0, sizeof(*exception));
 	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 					   PFERR_WRITE_MASK, exception);
 }
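
The vmread fix and the memset() above, like the s390 "s390irq = {}" change
earlier, all close the same class of bug: an on-stack "out" structure that
is only filled on some return paths but consumed (or copied out)
unconditionally by a caller. A small illustrative userspace example, with
invented names rather than the actual KVM code paths, of why zeroing the
structure up front is a safe workaround:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fault_info {
	uint64_t address;
	uint32_t error_code;
};

/* Returns 0 on success, -1 on fault; fills *fault only on the fault path. */
static int write_guest(uint64_t gva, const void *buf, size_t len,
		       struct fault_info *fault)
{
	/*
	 * Defensive zeroing, mirroring the memset() added above: a caller
	 * that consults *fault on a path that never filled it now sees
	 * zeroes instead of leftover stack contents.
	 */
	memset(fault, 0, sizeof(*fault));

	if (gva >= (1ULL << 48)) {		/* pretend this access faults */
		fault->address = gva;
		fault->error_code = 0x2;	/* write fault */
		return -1;
	}
	(void)buf;
	(void)len;
	return 0;
}

int main(void)
{
	struct fault_info fault;		/* deliberately uninitialized */
	char data[8] = "example";

	/* A careless caller that reports the fault info without checking: */
	write_guest(0x1000, data, sizeof(data), &fault);
	printf("addr=%#llx ec=%#x\n",
	       (unsigned long long)fault.address, (unsigned)fault.error_code);
	return 0;
}

Without the defensive zeroing the printf() would expose whatever happened to
be on the stack; with it, the success path prints zeroes. The nested VMX
change goes one step further and has the caller check the return value
before injecting the fault at all.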