Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
"One NULL pointer dereference, and two fixes for regressions introduced
during the merge window.

The rest are fixes for MIPS, s390 and nested VMX"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
kvm: x86: Check memopp before dereference (CVE-2016-8630)
kvm: nVMX: VMCLEAR an active shadow VMCS after last use
KVM: x86: drop TSC offsetting kvm_x86_ops to fix KVM_GET/SET_CLOCK
KVM: x86: fix wbinvd_dirty_mask use-after-free
kvm/x86: Show WRMSR data is in hex
kvm: nVMX: Fix kernel panics induced by illegal INVEPT/INVVPID types
KVM: document lock orders
KVM: fix OOPS on flush_work
KVM: s390: Fix STHYI buffer alignment for diag224
KVM: MIPS: Precalculate MMIO load resume PC
KVM: MIPS: Make ERET handle ERL before EXL
KVM: MIPS: Fix lazy user ASID regenerate for SMP

+95 -106
+11 -1
Documentation/virtual/kvm/locking.txt
··· 4 4 1. Acquisition Orders 5 5 --------------------- 6 6 7 - (to be written) 7 + The acquisition orders for mutexes are as follows: 8 + 9 + - kvm->lock is taken outside vcpu->mutex 10 + 11 + - kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock 12 + 13 + - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring 14 + them together is quite rare. 15 + 16 + For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything 17 + else is a leaf: no other lock is taken inside the critical sections. 8 18 9 19 2: Exception 10 20 ------------
+4 -3
arch/mips/include/asm/kvm_host.h
··· 293 293 /* Host KSEG0 address of the EI/DI offset */ 294 294 void *kseg0_commpage; 295 295 296 - u32 io_gpr; /* GPR used as IO source/target */ 296 + /* Resume PC after MMIO completion */ 297 + unsigned long io_pc; 298 + /* GPR used as IO source/target */ 299 + u32 io_gpr; 297 300 298 301 struct hrtimer comparecount_timer; 299 302 /* Count timer control KVM register */ ··· 317 314 318 315 /* Bitmask of pending exceptions to be cleared */ 319 316 unsigned long pending_exceptions_clr; 320 - 321 - u32 pending_load_cause; 322 317 323 318 /* Save/Restore the entryhi register when we are preempted/scheduled back in */ 324 319 unsigned long preempt_entryhi;
+19 -13
arch/mips/kvm/emulate.c
··· 790 790 struct mips_coproc *cop0 = vcpu->arch.cop0; 791 791 enum emulation_result er = EMULATE_DONE; 792 792 793 - if (kvm_read_c0_guest_status(cop0) & ST0_EXL) { 793 + if (kvm_read_c0_guest_status(cop0) & ST0_ERL) { 794 + kvm_clear_c0_guest_status(cop0, ST0_ERL); 795 + vcpu->arch.pc = kvm_read_c0_guest_errorepc(cop0); 796 + } else if (kvm_read_c0_guest_status(cop0) & ST0_EXL) { 794 797 kvm_debug("[%#lx] ERET to %#lx\n", vcpu->arch.pc, 795 798 kvm_read_c0_guest_epc(cop0)); 796 799 kvm_clear_c0_guest_status(cop0, ST0_EXL); 797 800 vcpu->arch.pc = kvm_read_c0_guest_epc(cop0); 798 801 799 - } else if (kvm_read_c0_guest_status(cop0) & ST0_ERL) { 800 - kvm_clear_c0_guest_status(cop0, ST0_ERL); 801 - vcpu->arch.pc = kvm_read_c0_guest_errorepc(cop0); 802 802 } else { 803 803 kvm_err("[%#lx] ERET when MIPS_SR_EXL|MIPS_SR_ERL == 0\n", 804 804 vcpu->arch.pc); ··· 1528 1528 struct kvm_vcpu *vcpu) 1529 1529 { 1530 1530 enum emulation_result er = EMULATE_DO_MMIO; 1531 + unsigned long curr_pc; 1531 1532 u32 op, rt; 1532 1533 u32 bytes; 1533 1534 1534 1535 rt = inst.i_format.rt; 1535 1536 op = inst.i_format.opcode; 1536 1537 1537 - vcpu->arch.pending_load_cause = cause; 1538 + /* 1539 + * Find the resume PC now while we have safe and easy access to the 1540 + * prior branch instruction, and save it for 1541 + * kvm_mips_complete_mmio_load() to restore later. 
1542 + */ 1543 + curr_pc = vcpu->arch.pc; 1544 + er = update_pc(vcpu, cause); 1545 + if (er == EMULATE_FAIL) 1546 + return er; 1547 + vcpu->arch.io_pc = vcpu->arch.pc; 1548 + vcpu->arch.pc = curr_pc; 1549 + 1538 1550 vcpu->arch.io_gpr = rt; 1539 1551 1540 1552 switch (op) { ··· 2506 2494 goto done; 2507 2495 } 2508 2496 2509 - er = update_pc(vcpu, vcpu->arch.pending_load_cause); 2510 - if (er == EMULATE_FAIL) 2511 - return er; 2497 + /* Restore saved resume PC */ 2498 + vcpu->arch.pc = vcpu->arch.io_pc; 2512 2499 2513 2500 switch (run->mmio.len) { 2514 2501 case 4: ··· 2528 2517 *gpr = *(u8 *) run->mmio.data; 2529 2518 break; 2530 2519 } 2531 - 2532 - if (vcpu->arch.pending_load_cause & CAUSEF_BD) 2533 - kvm_debug("[%#lx] Completing %d byte BD Load to gpr %d (0x%08lx) type %d\n", 2534 - vcpu->arch.pc, run->mmio.len, vcpu->arch.io_gpr, *gpr, 2535 - vcpu->mmio_needed); 2536 2520 2537 2521 done: 2538 2522 return er;
+4 -1
arch/mips/kvm/mips.c
··· 426 426 static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) 427 427 { 428 428 struct mips_coproc *cop0 = vcpu->arch.cop0; 429 - int cpu = smp_processor_id(); 429 + int i, cpu = smp_processor_id(); 430 430 unsigned int gasid; 431 431 432 432 /* ··· 442 442 vcpu); 443 443 vcpu->arch.guest_user_asid[cpu] = 444 444 vcpu->arch.guest_user_mm.context.asid[cpu]; 445 + for_each_possible_cpu(i) 446 + if (i != cpu) 447 + vcpu->arch.guest_user_asid[cpu] = 0; 445 448 vcpu->arch.last_user_gasid = gasid; 446 449 } 447 450 }
-4
arch/mips/kvm/mmu.c
··· 260 260 261 261 if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & 262 262 asid_version_mask(cpu)) { 263 - u32 gasid = kvm_read_c0_guest_entryhi(vcpu->arch.cop0) & 264 - KVM_ENTRYHI_ASID; 265 - 266 263 kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); 267 264 vcpu->arch.guest_user_asid[cpu] = 268 265 vcpu->arch.guest_user_mm.context.asid[cpu]; 269 - vcpu->arch.last_user_gasid = gasid; 270 266 newasid++; 271 267 272 268 kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
+2 -2
arch/s390/kvm/sthyi.c
··· 315 315 if (r < 0) 316 316 goto out; 317 317 318 - diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA); 318 + diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); 319 319 if (!diag224_buf || diag224(diag224_buf)) 320 320 goto out; 321 321 ··· 378 378 sctns->par.infpval1 |= PAR_WGHT_VLD; 379 379 380 380 out: 381 - kfree(diag224_buf); 381 + free_page((unsigned long)diag224_buf); 382 382 vfree(diag204_buf); 383 383 } 384 384
-3
arch/x86/include/asm/kvm_host.h
··· 948 948 int (*get_lpage_level)(void); 949 949 bool (*rdtscp_supported)(void); 950 950 bool (*invpcid_supported)(void); 951 - void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment); 952 951 953 952 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 954 953 ··· 956 957 bool (*has_wbinvd_exit)(void); 957 958 958 959 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 959 - 960 - u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); 961 960 962 961 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); 963 962
+1 -1
arch/x86/kvm/emulate.c
··· 5045 5045 /* Decode and fetch the destination operand: register or memory. */ 5046 5046 rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); 5047 5047 5048 - if (ctxt->rip_relative) 5048 + if (ctxt->rip_relative && likely(ctxt->memopp)) 5049 5049 ctxt->memopp->addr.mem.ea = address_mask(ctxt, 5050 5050 ctxt->memopp->addr.mem.ea + ctxt->_eip); 5051 5051
-23
arch/x86/kvm/svm.c
··· 1138 1138 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1139 1139 } 1140 1140 1141 - static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) 1142 - { 1143 - struct vcpu_svm *svm = to_svm(vcpu); 1144 - 1145 - svm->vmcb->control.tsc_offset += adjustment; 1146 - if (is_guest_mode(vcpu)) 1147 - svm->nested.hsave->control.tsc_offset += adjustment; 1148 - else 1149 - trace_kvm_write_tsc_offset(vcpu->vcpu_id, 1150 - svm->vmcb->control.tsc_offset - adjustment, 1151 - svm->vmcb->control.tsc_offset); 1152 - 1153 - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1154 - } 1155 - 1156 1141 static void avic_init_vmcb(struct vcpu_svm *svm) 1157 1142 { 1158 1143 struct vmcb *vmcb = svm->vmcb; ··· 3434 3449 return 0; 3435 3450 } 3436 3451 3437 - static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 3438 - { 3439 - struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); 3440 - return vmcb->control.tsc_offset + host_tsc; 3441 - } 3442 - 3443 3452 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3444 3453 { 3445 3454 struct vcpu_svm *svm = to_svm(vcpu); ··· 5401 5422 .has_wbinvd_exit = svm_has_wbinvd_exit, 5402 5423 5403 5424 .write_tsc_offset = svm_write_tsc_offset, 5404 - .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, 5405 - .read_l1_tsc = svm_read_l1_tsc, 5406 5425 5407 5426 .set_tdp_cr3 = set_tdp_cr3, 5408 5427
+20 -45
arch/x86/kvm/vmx.c
··· 187 187 */ 188 188 struct loaded_vmcs { 189 189 struct vmcs *vmcs; 190 + struct vmcs *shadow_vmcs; 190 191 int cpu; 191 192 int launched; 192 193 struct list_head loaded_vmcss_on_cpu_link; ··· 412 411 * memory during VMXOFF, VMCLEAR, VMPTRLD. 413 412 */ 414 413 struct vmcs12 *cached_vmcs12; 415 - struct vmcs *current_shadow_vmcs; 416 414 /* 417 415 * Indicates if the shadow vmcs must be updated with the 418 416 * data hold by vmcs12 ··· 421 421 /* vmcs02_list cache of VMCSs recently used to run L2 guests */ 422 422 struct list_head vmcs02_pool; 423 423 int vmcs02_num; 424 - u64 vmcs01_tsc_offset; 425 424 bool change_vmcs01_virtual_x2apic_mode; 426 425 /* L2 must run next, and mustn't decide to exit to L1. */ 427 426 bool nested_run_pending; ··· 1418 1419 static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) 1419 1420 { 1420 1421 vmcs_clear(loaded_vmcs->vmcs); 1422 + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 1423 + vmcs_clear(loaded_vmcs->shadow_vmcs); 1421 1424 loaded_vmcs->cpu = -1; 1422 1425 loaded_vmcs->launched = 0; 1423 1426 } ··· 2606 2605 } 2607 2606 2608 2607 /* 2609 - * Like guest_read_tsc, but always returns L1's notion of the timestamp 2610 - * counter, even if a nested guest (L2) is currently running. 2611 - */ 2612 - static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 2613 - { 2614 - u64 tsc_offset; 2615 - 2616 - tsc_offset = is_guest_mode(vcpu) ? 2617 - to_vmx(vcpu)->nested.vmcs01_tsc_offset : 2618 - vmcs_read64(TSC_OFFSET); 2619 - return host_tsc + tsc_offset; 2620 - } 2621 - 2622 - /* 2623 2608 * writes 'offset' into guest's timestamp counter offset register 2624 2609 */ 2625 2610 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) ··· 2618 2631 * to the newly set TSC to get L2's TSC. 
2619 2632 */ 2620 2633 struct vmcs12 *vmcs12; 2621 - to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; 2622 2634 /* recalculate vmcs02.TSC_OFFSET: */ 2623 2635 vmcs12 = get_vmcs12(vcpu); 2624 2636 vmcs_write64(TSC_OFFSET, offset + ··· 2628 2642 vmcs_read64(TSC_OFFSET), offset); 2629 2643 vmcs_write64(TSC_OFFSET, offset); 2630 2644 } 2631 - } 2632 - 2633 - static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) 2634 - { 2635 - u64 offset = vmcs_read64(TSC_OFFSET); 2636 - 2637 - vmcs_write64(TSC_OFFSET, offset + adjustment); 2638 - if (is_guest_mode(vcpu)) { 2639 - /* Even when running L2, the adjustment needs to apply to L1 */ 2640 - to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; 2641 - } else 2642 - trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, 2643 - offset + adjustment); 2644 2645 } 2645 2646 2646 2647 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) ··· 3535 3562 loaded_vmcs_clear(loaded_vmcs); 3536 3563 free_vmcs(loaded_vmcs->vmcs); 3537 3564 loaded_vmcs->vmcs = NULL; 3565 + WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 3538 3566 } 3539 3567 3540 3568 static void free_kvm_area(void) ··· 6670 6696 if (!item) 6671 6697 return NULL; 6672 6698 item->vmcs02.vmcs = alloc_vmcs(); 6699 + item->vmcs02.shadow_vmcs = NULL; 6673 6700 if (!item->vmcs02.vmcs) { 6674 6701 kfree(item); 6675 6702 return NULL; ··· 7047 7072 shadow_vmcs->revision_id |= (1u << 31); 7048 7073 /* init shadow vmcs */ 7049 7074 vmcs_clear(shadow_vmcs); 7050 - vmx->nested.current_shadow_vmcs = shadow_vmcs; 7075 + vmx->vmcs01.shadow_vmcs = shadow_vmcs; 7051 7076 } 7052 7077 7053 7078 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); ··· 7149 7174 free_page((unsigned long)vmx->nested.msr_bitmap); 7150 7175 vmx->nested.msr_bitmap = NULL; 7151 7176 } 7152 - if (enable_shadow_vmcs) 7153 - free_vmcs(vmx->nested.current_shadow_vmcs); 7177 + if (enable_shadow_vmcs) { 7178 + vmcs_clear(vmx->vmcs01.shadow_vmcs); 7179 + free_vmcs(vmx->vmcs01.shadow_vmcs); 7180 + 
vmx->vmcs01.shadow_vmcs = NULL; 7181 + } 7154 7182 kfree(vmx->nested.cached_vmcs12); 7155 7183 /* Unpin physical memory we referred to in current vmcs02 */ 7156 7184 if (vmx->nested.apic_access_page) { ··· 7330 7352 int i; 7331 7353 unsigned long field; 7332 7354 u64 field_value; 7333 - struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 7355 + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 7334 7356 const unsigned long *fields = shadow_read_write_fields; 7335 7357 const int num_fields = max_shadow_read_write_fields; 7336 7358 ··· 7379 7401 int i, q; 7380 7402 unsigned long field; 7381 7403 u64 field_value = 0; 7382 - struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 7404 + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 7383 7405 7384 7406 vmcs_load(shadow_vmcs); 7385 7407 ··· 7569 7591 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, 7570 7592 SECONDARY_EXEC_SHADOW_VMCS); 7571 7593 vmcs_write64(VMCS_LINK_POINTER, 7572 - __pa(vmx->nested.current_shadow_vmcs)); 7594 + __pa(vmx->vmcs01.shadow_vmcs)); 7573 7595 vmx->nested.sync_shadow_vmcs = true; 7574 7596 } 7575 7597 } ··· 7637 7659 7638 7660 types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 7639 7661 7640 - if (!(types & (1UL << type))) { 7662 + if (type >= 32 || !(types & (1 << type))) { 7641 7663 nested_vmx_failValid(vcpu, 7642 7664 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 7643 7665 skip_emulated_instruction(vcpu); ··· 7700 7722 7701 7723 types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; 7702 7724 7703 - if (!(types & (1UL << type))) { 7725 + if (type >= 32 || !(types & (1 << type))) { 7704 7726 nested_vmx_failValid(vcpu, 7705 7727 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 7706 7728 skip_emulated_instruction(vcpu); ··· 9134 9156 9135 9157 vmx->loaded_vmcs = &vmx->vmcs01; 9136 9158 vmx->loaded_vmcs->vmcs = alloc_vmcs(); 9159 + vmx->loaded_vmcs->shadow_vmcs = NULL; 9137 9160 if (!vmx->loaded_vmcs->vmcs) 9138 9161 goto free_msrs; 9139 9162 if (!vmm_exclusive) ··· 
10040 10061 10041 10062 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 10042 10063 vmcs_write64(TSC_OFFSET, 10043 - vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); 10064 + vcpu->arch.tsc_offset + vmcs12->tsc_offset); 10044 10065 else 10045 - vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 10066 + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 10046 10067 if (kvm_has_tsc_control) 10047 10068 decache_tsc_multiplier(vmx); 10048 10069 ··· 10271 10292 return -ENOMEM; 10272 10293 10273 10294 enter_guest_mode(vcpu); 10274 - 10275 - vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); 10276 10295 10277 10296 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 10278 10297 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); ··· 10795 10818 load_vmcs12_host_state(vcpu, vmcs12); 10796 10819 10797 10820 /* Update any VMCS fields that might have changed while L2 ran */ 10798 - vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 10821 + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 10799 10822 if (vmx->hv_deadline_tsc == -1) 10800 10823 vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, 10801 10824 PIN_BASED_VMX_PREEMPTION_TIMER); ··· 11316 11339 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 11317 11340 11318 11341 .write_tsc_offset = vmx_write_tsc_offset, 11319 - .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, 11320 - .read_l1_tsc = vmx_read_l1_tsc, 11321 11342 11322 11343 .set_tdp_cr3 = vmx_set_cr3, 11323 11344
+9 -7
arch/x86/kvm/x86.c
··· 1409 1409 1410 1410 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 1411 1411 { 1412 - return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc)); 1412 + return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc); 1413 1413 } 1414 1414 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); 1415 1415 ··· 1547 1547 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, 1548 1548 s64 adjustment) 1549 1549 { 1550 - kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); 1550 + kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment); 1551 1551 } 1552 1552 1553 1553 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) ··· 1555 1555 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) 1556 1556 WARN_ON(adjustment < 0); 1557 1557 adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); 1558 - kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); 1558 + adjust_tsc_offset_guest(vcpu, adjustment); 1559 1559 } 1560 1560 1561 1561 #ifdef CONFIG_X86_64 ··· 2262 2262 /* Drop writes to this legacy MSR -- see rdmsr 2263 2263 * counterpart for further detail. 
2264 2264 */ 2265 - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); 2265 + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); 2266 2266 break; 2267 2267 case MSR_AMD64_OSVW_ID_LENGTH: 2268 2268 if (!guest_cpuid_has_osvw(vcpu)) ··· 2280 2280 if (kvm_pmu_is_valid_msr(vcpu, msr)) 2281 2281 return kvm_pmu_set_msr(vcpu, msr_info); 2282 2282 if (!ignore_msrs) { 2283 - vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 2283 + vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", 2284 2284 msr, data); 2285 2285 return 1; 2286 2286 } else { 2287 - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 2287 + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", 2288 2288 msr, data); 2289 2289 break; 2290 2290 } ··· 7410 7410 7411 7411 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 7412 7412 { 7413 + void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; 7414 + 7413 7415 kvmclock_reset(vcpu); 7414 7416 7415 - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 7416 7417 kvm_x86_ops->vcpu_free(vcpu); 7418 + free_cpumask_var(wbinvd_dirty_mask); 7417 7419 } 7418 7420 7419 7421 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
+19 -3
virt/kvm/eventfd.c
··· 42 42 43 43 #ifdef CONFIG_HAVE_KVM_IRQFD 44 44 45 + static struct workqueue_struct *irqfd_cleanup_wq; 45 46 46 47 static void 47 48 irqfd_inject(struct work_struct *work) ··· 168 167 169 168 list_del_init(&irqfd->list); 170 169 171 - schedule_work(&irqfd->shutdown); 170 + queue_work(irqfd_cleanup_wq, &irqfd->shutdown); 172 171 } 173 172 174 173 int __attribute__((weak)) kvm_arch_set_irq_inatomic( ··· 555 554 * so that we guarantee there will not be any more interrupts on this 556 555 * gsi once this deassign function returns. 557 556 */ 558 - flush_work(&irqfd->shutdown); 557 + flush_workqueue(irqfd_cleanup_wq); 559 558 560 559 return 0; 561 560 } ··· 592 591 * Block until we know all outstanding shutdown jobs have completed 593 592 * since we do not take a kvm* reference. 594 593 */ 595 - flush_work(&irqfd->shutdown); 594 + flush_workqueue(irqfd_cleanup_wq); 596 595 597 596 } 598 597 ··· 622 621 spin_unlock_irq(&kvm->irqfds.lock); 623 622 } 624 623 624 + /* 625 + * create a host-wide workqueue for issuing deferred shutdown requests 626 + * aggregated from all vm* instances. We need our own isolated 627 + * queue to ease flushing work items when a VM exits. 628 + */ 629 + int kvm_irqfd_init(void) 630 + { 631 + irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0); 632 + if (!irqfd_cleanup_wq) 633 + return -ENOMEM; 634 + 635 + return 0; 636 + } 637 + 625 638 void kvm_irqfd_exit(void) 626 639 { 640 + destroy_workqueue(irqfd_cleanup_wq); 627 641 } 628 642 #endif 629 643
+6
virt/kvm/kvm_main.c
··· 3844 3844 * kvm_arch_init makes sure there's at most one caller 3845 3845 * for architectures that support multiple implementations, 3846 3846 * like intel and amd on x86. 3847 + * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 3848 + * conflicts in case kvm is already setup for another implementation. 3847 3849 */ 3850 + r = kvm_irqfd_init(); 3851 + if (r) 3852 + goto out_irqfd; 3848 3853 3849 3854 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 3850 3855 r = -ENOMEM; ··· 3931 3926 free_cpumask_var(cpus_hardware_enabled); 3932 3927 out_free_0: 3933 3928 kvm_irqfd_exit(); 3929 + out_irqfd: 3934 3930 kvm_arch_exit(); 3935 3931 out_fail: 3936 3932 return r;