Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: Update Posted-Interrupts Descriptor when vCPU is blocked

This patch updates the Posted-Interrupts Descriptor when vCPU
is blocked.

pre-block:
- Add the vCPU to the blocked per-CPU list
- Set 'NV' to POSTED_INTR_WAKEUP_VECTOR

post-block:
- Remove the vCPU from the per-CPU list

Signed-off-by: Feng Wu <feng.wu@intel.com>
[Concentrate invocation of pre/post-block hooks to vcpu_block. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

authored by

Feng Wu and committed by
Paolo Bonzini
bf9f6ac8 28b835d6

+206 -10
+12
Documentation/virtual/kvm/locking.txt
··· 166 166 MMIO/PIO address->device structure mapping (kvm->buses). 167 167 The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu 168 168 if it is needed by multiple functions. 169 + 170 + Name: blocked_vcpu_on_cpu_lock 171 + Type: spinlock_t 172 + Arch: x86 173 + Protects: blocked_vcpu_on_cpu 174 + Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. 175 + When VT-d posted-interrupts is supported and the VM has assigned 176 + devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu, 177 + protected by blocked_vcpu_on_cpu_lock. When VT-d hardware issues 178 + a wakeup notification event because external interrupts from the 179 + assigned devices happen, we will find the vCPU on the list to 180 + wake up.
+11
arch/x86/include/asm/kvm_host.h
··· 899 899 /* pmu operations of sub-arch */ 900 900 const struct kvm_pmu_ops *pmu_ops; 901 901 902 + /* 903 + * Architecture specific hooks for vCPU blocking due to 904 + * HLT instruction. 905 + * Returns for .pre_block(): 906 + * - 0 means continue to block the vCPU. 907 + * - 1 means we cannot block the vCPU since some event 908 + * happens during this period, such as, 'ON' bit in 909 + * posted-interrupts descriptor is set. 910 + */ 911 + int (*pre_block)(struct kvm_vcpu *vcpu); 912 + void (*post_block)(struct kvm_vcpu *vcpu); 902 913 int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, 903 914 uint32_t guest_irq, bool set); 904 915 };
+153
arch/x86/kvm/vmx.c
··· 878 878 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 879 879 static DEFINE_PER_CPU(struct desc_ptr, host_gdt); 880 880 881 + /* 882 + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we 883 + * can find which vCPU should be waken up. 884 + */ 885 + static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); 886 + static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); 887 + 881 888 static unsigned long *vmx_io_bitmap_a; 882 889 static unsigned long *vmx_io_bitmap_b; 883 890 static unsigned long *vmx_msr_bitmap_legacy; ··· 2993 2986 return -EBUSY; 2994 2987 2995 2988 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2989 + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); 2990 + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 2996 2991 2997 2992 /* 2998 2993 * Now we can enable the vmclear operation in kdump ··· 6054 6045 ple_window_grow, INT_MIN); 6055 6046 } 6056 6047 6048 + /* 6049 + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
6050 + */ 6051 + static void wakeup_handler(void) 6052 + { 6053 + struct kvm_vcpu *vcpu; 6054 + int cpu = smp_processor_id(); 6055 + 6056 + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 6057 + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), 6058 + blocked_vcpu_list) { 6059 + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 6060 + 6061 + if (pi_test_on(pi_desc) == 1) 6062 + kvm_vcpu_kick(vcpu); 6063 + } 6064 + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 6065 + } 6066 + 6057 6067 static __init int hardware_setup(void) 6058 6068 { 6059 6069 int r = -ENOMEM, i, msr; ··· 6258 6230 kvm_x86_ops->flush_log_dirty = NULL; 6259 6231 kvm_x86_ops->enable_log_dirty_pt_masked = NULL; 6260 6232 } 6233 + 6234 + kvm_set_posted_intr_wakeup_handler(wakeup_handler); 6261 6235 6262 6236 return alloc_kvm_area(); 6263 6237 ··· 10462 10432 } 10463 10433 10464 10434 /* 10435 + * This routine does the following things for vCPU which is going 10436 + * to be blocked if VT-d PI is enabled. 10437 + * - Store the vCPU to the wakeup list, so when interrupts happen 10438 + * we can find the right vCPU to wake up. 10439 + * - Change the Posted-interrupt descriptor as below: 10440 + * 'NDST' <-- vcpu->pre_pcpu 10441 + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR 10442 + * - If 'ON' is set during this process, which means at least one 10443 + * interrupt is posted for this vCPU, we cannot block it, in 10444 + * this case, return 1, otherwise, return 0. 
10445 + * 10446 + */ 10447 + static int vmx_pre_block(struct kvm_vcpu *vcpu) 10448 + { 10449 + unsigned long flags; 10450 + unsigned int dest; 10451 + struct pi_desc old, new; 10452 + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 10453 + 10454 + if (!kvm_arch_has_assigned_device(vcpu->kvm) || 10455 + !irq_remapping_cap(IRQ_POSTING_CAP)) 10456 + return 0; 10457 + 10458 + vcpu->pre_pcpu = vcpu->cpu; 10459 + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, 10460 + vcpu->pre_pcpu), flags); 10461 + list_add_tail(&vcpu->blocked_vcpu_list, 10462 + &per_cpu(blocked_vcpu_on_cpu, 10463 + vcpu->pre_pcpu)); 10464 + spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, 10465 + vcpu->pre_pcpu), flags); 10466 + 10467 + do { 10468 + old.control = new.control = pi_desc->control; 10469 + 10470 + /* 10471 + * We should not block the vCPU if 10472 + * an interrupt is posted for it. 10473 + */ 10474 + if (pi_test_on(pi_desc) == 1) { 10475 + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, 10476 + vcpu->pre_pcpu), flags); 10477 + list_del(&vcpu->blocked_vcpu_list); 10478 + spin_unlock_irqrestore( 10479 + &per_cpu(blocked_vcpu_on_cpu_lock, 10480 + vcpu->pre_pcpu), flags); 10481 + vcpu->pre_pcpu = -1; 10482 + 10483 + return 1; 10484 + } 10485 + 10486 + WARN((pi_desc->sn == 1), 10487 + "Warning: SN field of posted-interrupts " 10488 + "is set before blocking\n"); 10489 + 10490 + /* 10491 + * Since vCPU can be preempted during this process, 10492 + * vcpu->cpu could be different with pre_pcpu, we 10493 + * need to set pre_pcpu as the destination of wakeup 10494 + * notification event, then we can find the right vCPU 10495 + * to wakeup in wakeup handler if interrupts happen 10496 + * when the vCPU is in blocked state. 
10497 + */ 10498 + dest = cpu_physical_id(vcpu->pre_pcpu); 10499 + 10500 + if (x2apic_enabled()) 10501 + new.ndst = dest; 10502 + else 10503 + new.ndst = (dest << 8) & 0xFF00; 10504 + 10505 + /* set 'NV' to 'wakeup vector' */ 10506 + new.nv = POSTED_INTR_WAKEUP_VECTOR; 10507 + } while (cmpxchg(&pi_desc->control, old.control, 10508 + new.control) != old.control); 10509 + 10510 + return 0; 10511 + } 10512 + 10513 + static void vmx_post_block(struct kvm_vcpu *vcpu) 10514 + { 10515 + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 10516 + struct pi_desc old, new; 10517 + unsigned int dest; 10518 + unsigned long flags; 10519 + 10520 + if (!kvm_arch_has_assigned_device(vcpu->kvm) || 10521 + !irq_remapping_cap(IRQ_POSTING_CAP)) 10522 + return; 10523 + 10524 + do { 10525 + old.control = new.control = pi_desc->control; 10526 + 10527 + dest = cpu_physical_id(vcpu->cpu); 10528 + 10529 + if (x2apic_enabled()) 10530 + new.ndst = dest; 10531 + else 10532 + new.ndst = (dest << 8) & 0xFF00; 10533 + 10534 + /* Allow posting non-urgent interrupts */ 10535 + new.sn = 0; 10536 + 10537 + /* set 'NV' to 'notification vector' */ 10538 + new.nv = POSTED_INTR_VECTOR; 10539 + } while (cmpxchg(&pi_desc->control, old.control, 10540 + new.control) != old.control); 10541 + 10542 + if(vcpu->pre_pcpu != -1) { 10543 + spin_lock_irqsave( 10544 + &per_cpu(blocked_vcpu_on_cpu_lock, 10545 + vcpu->pre_pcpu), flags); 10546 + list_del(&vcpu->blocked_vcpu_list); 10547 + spin_unlock_irqrestore( 10548 + &per_cpu(blocked_vcpu_on_cpu_lock, 10549 + vcpu->pre_pcpu), flags); 10550 + vcpu->pre_pcpu = -1; 10551 + } 10552 + } 10553 + 10554 + /* 10465 10555 * vmx_update_pi_irte - set IRTE for Posted-Interrupts 10466 10556 * 10467 10557 * @kvm: kvm ··· 10771 10621 .slot_disable_log_dirty = vmx_slot_disable_log_dirty, 10772 10622 .flush_log_dirty = vmx_flush_log_dirty, 10773 10623 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, 10624 + 10625 + .pre_block = vmx_pre_block, 10626 + .post_block = 
vmx_post_block, 10774 10627 10775 10628 .pmu_ops = &intel_pmu_ops, 10776 10629
+24 -10
arch/x86/kvm/x86.c
··· 6335 6335 } 6336 6336 } 6337 6337 6338 + /* 6339 + * KVM_REQ_EVENT is not set when posted interrupts are set by 6340 + * VT-d hardware, so we have to update RVI unconditionally. 6341 + */ 6342 + if (kvm_lapic_enabled(vcpu)) { 6343 + /* 6344 + * Update architecture specific hints for APIC 6345 + * virtual interrupt delivery. 6346 + */ 6347 + if (kvm_x86_ops->hwapic_irr_update) 6348 + kvm_x86_ops->hwapic_irr_update(vcpu, 6349 + kvm_lapic_find_highest_irr(vcpu)); 6350 + } 6351 + 6338 6352 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 6339 6353 kvm_apic_accept_events(vcpu); 6340 6354 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { ··· 6365 6351 kvm_x86_ops->enable_irq_window(vcpu); 6366 6352 6367 6353 if (kvm_lapic_enabled(vcpu)) { 6368 - /* 6369 - * Update architecture specific hints for APIC 6370 - * virtual interrupt delivery. 6371 - */ 6372 - if (kvm_x86_ops->hwapic_irr_update) 6373 - kvm_x86_ops->hwapic_irr_update(vcpu, 6374 - kvm_lapic_find_highest_irr(vcpu)); 6375 6354 update_cr8_intercept(vcpu); 6376 6355 kvm_lapic_sync_to_vapic(vcpu); 6377 6356 } ··· 6500 6493 6501 6494 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) 6502 6495 { 6503 - if (!kvm_arch_vcpu_runnable(vcpu)) { 6496 + if (!kvm_arch_vcpu_runnable(vcpu) && 6497 + (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { 6504 6498 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 6505 6499 kvm_vcpu_block(vcpu); 6506 6500 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 6501 + 6502 + if (kvm_x86_ops->post_block) 6503 + kvm_x86_ops->post_block(vcpu); 6504 + 6507 6505 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) 6508 6506 return 1; 6509 6507 } ··· 6540 6528 6541 6529 for (;;) { 6542 6530 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 6543 - !vcpu->arch.apf.halted) 6531 + !vcpu->arch.apf.halted) { 6544 6532 r = vcpu_enter_guest(vcpu); 6545 - else 6533 + } else { 6546 6534 r = vcpu_block(kvm, vcpu); 6535 + } 6536 + 6547 6537 if (r <= 0) 6548 6538 break; 
6549 6539
+3
include/linux/kvm_host.h
··· 234 234 unsigned long requests; 235 235 unsigned long guest_debug; 236 236 237 + int pre_pcpu; 238 + struct list_head blocked_vcpu_list; 239 + 237 240 struct mutex mutex; 238 241 struct kvm_run *run; 239 242
+3
virt/kvm/kvm_main.c
··· 230 230 init_waitqueue_head(&vcpu->wq); 231 231 kvm_async_pf_vcpu_init(vcpu); 232 232 233 + vcpu->pre_pcpu = -1; 234 + INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 235 + 233 236 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 234 237 if (!page) { 235 238 r = -ENOMEM;