Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm/arm64: KVM: Rework the arch timer to use level-triggered semantics

The arch timer currently uses edge-triggered semantics in the sense that
the line is never sampled by the vgic and lowering the line from the
timer to the vgic doesn't have any effect on the pending state of
virtual interrupts in the vgic. This means that we do not support a
guest with the otherwise valid behavior of (1) disable interrupts (2)
enable the timer (3) disable the timer (4) enable interrupts. Such a
guest would validly not expect to see any interrupts on real hardware,
but will see interrupts on KVM.

This patch fixes this shortcoming through the following series of
changes.

First, we change the flow of the timer/vgic sync/flush operations. Now
the timer is always flushed/synced before the vgic, because the vgic
samples the state of the timer output. This has the implication that we
move the timer operations into non-preemptible sections, but that is
fine after the previous commit getting rid of hrtimer schedules on every
entry/exit.

Second, we change the internal behavior of the timer, letting the timer
keep track of its previous output state, and only lower/raise the line
to the vgic when the state changes. Note that in theory this could have
been accomplished more simply by signalling the vgic every time the
state *potentially* changed, but we don't want to be hitting the vgic
more often than necessary.

Third, we get rid of the use of the map->active field in the vgic and
instead simply set the interrupt as active on the physical distributor
whenever the input to the GIC is asserted and conversely clear the
physical active state when the input to the GIC is deasserted.

Fourth, and finally, we now initialize the timer PPIs (and all the other
unused PPIs for now), to be level-triggered, and modify the sync code to
sample the line state on HW sync and re-inject a new interrupt if it is
still pending at that time.

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>

+91 -108
+8 -3
arch/arm/kvm/arm.c
··· 561 561 562 562 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) { 563 563 local_irq_enable(); 564 + kvm_timer_sync_hwstate(vcpu); 564 565 kvm_vgic_sync_hwstate(vcpu); 565 566 preempt_enable(); 566 - kvm_timer_sync_hwstate(vcpu); 567 567 continue; 568 568 } 569 569 ··· 608 608 kvm_guest_exit(); 609 609 trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); 610 610 611 + /* 612 + * We must sync the timer state before the vgic state so that 613 + * the vgic can properly sample the updated state of the 614 + * interrupt line. 615 + */ 616 + kvm_timer_sync_hwstate(vcpu); 617 + 611 618 kvm_vgic_sync_hwstate(vcpu); 612 619 613 620 preempt_enable(); 614 - 615 - kvm_timer_sync_hwstate(vcpu); 616 621 617 622 ret = handle_exit(vcpu, run, ret); 618 623 }
+1 -1
include/kvm/arm_arch_timer.h
··· 51 51 bool armed; 52 52 53 53 /* Timer IRQ */ 54 - const struct kvm_irq_level *irq; 54 + struct kvm_irq_level irq; 55 55 56 56 /* VGIC mapping */ 57 57 struct irq_phys_map *map;
-3
include/kvm/arm_vgic.h
··· 159 159 u32 virt_irq; 160 160 u32 phys_irq; 161 161 u32 irq; 162 - bool active; 163 162 }; 164 163 165 164 struct irq_phys_map_entry { ··· 353 354 struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, 354 355 int virt_irq, int irq); 355 356 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map); 356 - bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map); 357 - void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active); 358 357 359 358 #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) 360 359 #define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus))
+56 -25
virt/kvm/arm/arch_timer.c
··· 59 59 } 60 60 } 61 61 62 - static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu) 63 - { 64 - int ret; 65 - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 66 - 67 - kvm_vgic_set_phys_irq_active(timer->map, true); 68 - ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, 69 - timer->map, 70 - timer->irq->level); 71 - WARN_ON(ret); 72 - } 73 - 74 62 static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) 75 63 { 76 64 struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; ··· 104 116 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 105 117 106 118 return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) && 107 - (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) && 108 - !kvm_vgic_get_phys_irq_active(timer->map); 119 + (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE); 109 120 } 110 121 111 122 bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) ··· 119 132 now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; 120 133 121 134 return cval <= now; 135 + } 136 + 137 + static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) 138 + { 139 + int ret; 140 + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 141 + 142 + BUG_ON(!vgic_initialized(vcpu->kvm)); 143 + 144 + timer->irq.level = new_level; 145 + ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, 146 + timer->map, 147 + timer->irq.level); 148 + WARN_ON(ret); 149 + } 150 + 151 + /* 152 + * Check if there was a change in the timer state (should we raise or lower 153 + * the line level to the GIC). 154 + */ 155 + static void kvm_timer_update_state(struct kvm_vcpu *vcpu) 156 + { 157 + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 158 + 159 + /* 160 + * If userspace modified the timer registers via SET_ONE_REG before 161 + * the vgic was initialized, we mustn't set the timer->irq.level value 162 + * because the guest would never see the interrupt. Instead wait 163 + * until we call this function from kvm_timer_flush_hwstate. 
164 + */ 165 + if (!vgic_initialized(vcpu->kvm)) 166 + return; 167 + 168 + if (kvm_timer_should_fire(vcpu) != timer->irq.level) 169 + kvm_timer_update_irq(vcpu, !timer->irq.level); 122 170 } 123 171 124 172 /* ··· 214 192 bool phys_active; 215 193 int ret; 216 194 217 - if (kvm_timer_should_fire(vcpu)) 218 - kvm_timer_inject_irq(vcpu); 195 + kvm_timer_update_state(vcpu); 219 196 220 197 /* 221 - * We keep track of whether the edge-triggered interrupt has been 222 - * signalled to the vgic/guest, and if so, we mask the interrupt and 223 - * the physical distributor to prevent the timer from raising a 224 - * physical interrupt whenever we run a guest, preventing forward 225 - * VCPU progress. 198 + * If we enter the guest with the virtual input level to the VGIC 199 + * asserted, then we have already told the VGIC what we need to, and 200 + * we don't need to exit from the guest until the guest deactivates 201 + * the already injected interrupt, so therefore we should set the 202 + * hardware active state to prevent unnecessary exits from the guest. 203 + * 204 + * Conversely, if the virtual input level is deasserted, then always 205 + * clear the hardware active state to ensure that hardware interrupts 206 + * from the timer triggers a guest exit. 226 207 */ 227 - if (kvm_vgic_get_phys_irq_active(timer->map)) 208 + if (timer->irq.level) 228 209 phys_active = true; 229 210 else 230 211 phys_active = false; ··· 251 226 252 227 BUG_ON(timer_is_armed(timer)); 253 228 254 - if (kvm_timer_should_fire(vcpu)) 255 - kvm_timer_inject_irq(vcpu); 229 + /* 230 + * The guest could have modified the timer registers or the timer 231 + * could have expired, update the timer state. 232 + */ 233 + kvm_timer_update_state(vcpu); 256 234 } 257 235 258 236 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, ··· 270 242 * kvm_vcpu_set_target(). To handle this, we determine 271 243 * vcpu timer irq number when the vcpu is reset. 
272 244 */ 273 - timer->irq = irq; 245 + timer->irq.irq = irq->irq; 274 246 275 247 /* 276 248 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 ··· 279 251 * the ARMv7 architecture. 280 252 */ 281 253 timer->cntv_ctl = 0; 254 + kvm_timer_update_state(vcpu); 282 255 283 256 /* 284 257 * Tell the VGIC that the virtual interrupt is tied to a ··· 324 295 default: 325 296 return -1; 326 297 } 298 + 299 + kvm_timer_update_state(vcpu); 327 300 return 0; 328 301 } 329 302
+26 -76
virt/kvm/arm/vgic.c
··· 537 537 return false; 538 538 } 539 539 540 - /* 541 - * If a mapped interrupt's state has been modified by the guest such that it 542 - * is no longer active or pending, without it have gone through the sync path, 543 - * then the map->active field must be cleared so the interrupt can be taken 544 - * again. 545 - */ 546 - static void vgic_handle_clear_mapped_irq(struct kvm_vcpu *vcpu) 547 - { 548 - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 549 - struct list_head *root; 550 - struct irq_phys_map_entry *entry; 551 - struct irq_phys_map *map; 552 - 553 - rcu_read_lock(); 554 - 555 - /* Check for PPIs */ 556 - root = &vgic_cpu->irq_phys_map_list; 557 - list_for_each_entry_rcu(entry, root, entry) { 558 - map = &entry->map; 559 - 560 - if (!vgic_dist_irq_is_pending(vcpu, map->virt_irq) && 561 - !vgic_irq_is_active(vcpu, map->virt_irq)) 562 - map->active = false; 563 - } 564 - 565 - rcu_read_unlock(); 566 - } 567 - 568 540 bool vgic_handle_clear_pending_reg(struct kvm *kvm, 569 541 struct kvm_exit_mmio *mmio, 570 542 phys_addr_t offset, int vcpu_id) ··· 567 595 vcpu_id, offset); 568 596 vgic_reg_access(mmio, reg, offset, mode); 569 597 570 - vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id)); 571 598 vgic_update_state(kvm); 572 599 return true; 573 600 } ··· 604 633 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); 605 634 606 635 if (mmio->is_write) { 607 - vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id)); 608 636 vgic_update_state(kvm); 609 637 return true; 610 638 } ··· 1413 1443 /* 1414 1444 * Save the physical active state, and reset it to inactive. 1415 1445 * 1416 - * Return 1 if HW interrupt went from active to inactive, and 0 otherwise. 1446 + * Return true if there's a pending level triggered interrupt line to queue. 
1417 1447 */ 1418 - static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr) 1448 + static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) 1419 1449 { 1450 + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 1420 1451 struct irq_phys_map *map; 1452 + bool phys_active; 1453 + bool level_pending; 1421 1454 int ret; 1422 1455 1423 1456 if (!(vlr.state & LR_HW)) 1424 - return 0; 1457 + return false; 1425 1458 1426 1459 map = vgic_irq_map_search(vcpu, vlr.irq); 1427 1460 BUG_ON(!map); 1428 1461 1429 1462 ret = irq_get_irqchip_state(map->irq, 1430 1463 IRQCHIP_STATE_ACTIVE, 1431 - &map->active); 1464 + &phys_active); 1432 1465 1433 1466 WARN_ON(ret); 1434 1467 1435 - if (map->active) 1468 + if (phys_active) 1436 1469 return 0; 1437 1470 1438 - return 1; 1471 + /* Mapped edge-triggered interrupts not yet supported. */ 1472 + WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); 1473 + spin_lock(&dist->lock); 1474 + level_pending = process_level_irq(vcpu, lr, vlr); 1475 + spin_unlock(&dist->lock); 1476 + return level_pending; 1439 1477 } 1440 1478 1441 1479 /* Sync back the VGIC state after a guest run */ ··· 1468 1490 continue; 1469 1491 1470 1492 vlr = vgic_get_lr(vcpu, lr); 1471 - if (vgic_sync_hwirq(vcpu, vlr)) { 1472 - /* 1473 - * So this is a HW interrupt that the guest 1474 - * EOI-ed. Clean the LR state and allow the 1475 - * interrupt to be sampled again. 1476 - */ 1477 - vlr.state = 0; 1478 - vlr.hwirq = 0; 1479 - vgic_set_lr(vcpu, lr, vlr); 1480 - vgic_irq_clear_queued(vcpu, vlr.irq); 1481 - set_bit(lr, elrsr_ptr); 1482 - } 1493 + if (vgic_sync_hwirq(vcpu, lr, vlr)) 1494 + level_pending = true; 1483 1495 1484 1496 if (!test_bit(lr, elrsr_ptr)) 1485 1497 continue; ··· 1849 1881 } 1850 1882 1851 1883 /** 1852 - * kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ 1853 - * 1854 - * Return the logical active state of a mapped interrupt. This doesn't 1855 - * necessarily reflects the current HW state. 
1856 - */ 1857 - bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map) 1858 - { 1859 - BUG_ON(!map); 1860 - return map->active; 1861 - } 1862 - 1863 - /** 1864 - * kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ 1865 - * 1866 - * Set the logical active state of a mapped interrupt. This doesn't 1867 - * immediately affects the HW state. 1868 - */ 1869 - void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active) 1870 - { 1871 - BUG_ON(!map); 1872 - map->active = active; 1873 - } 1874 - 1875 - /** 1876 1884 * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping 1877 1885 * @vcpu: The VCPU pointer 1878 1886 * @map: The pointer to a mapping obtained through kvm_vgic_map_phys_irq ··· 2073 2129 } 2074 2130 2075 2131 /* 2076 - * Enable all SGIs and configure all private IRQs as 2077 - * edge-triggered. 2132 + * Enable and configure all SGIs to be edge-triggere and 2133 + * configure all PPIs as level-triggered. 2078 2134 */ 2079 2135 for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { 2080 - if (i < VGIC_NR_SGIS) 2136 + if (i < VGIC_NR_SGIS) { 2137 + /* SGIs */ 2081 2138 vgic_bitmap_set_irq_val(&dist->irq_enabled, 2082 2139 vcpu->vcpu_id, i, 1); 2083 - if (i < VGIC_NR_PRIVATE_IRQS) 2084 2140 vgic_bitmap_set_irq_val(&dist->irq_cfg, 2085 2141 vcpu->vcpu_id, i, 2086 2142 VGIC_CFG_EDGE); 2143 + } else if (i < VGIC_NR_PRIVATE_IRQS) { 2144 + /* PPIs */ 2145 + vgic_bitmap_set_irq_val(&dist->irq_cfg, 2146 + vcpu->vcpu_id, i, 2147 + VGIC_CFG_LEVEL); 2148 + } 2087 2149 } 2088 2150 2089 2151 vgic_enable(vcpu);