Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: PPC: Book3S HV: Improve handling of local vs. global TLB invalidations

When we change or remove a HPT (hashed page table) entry, we can do
either a global TLB invalidation (tlbie) that works across the whole
machine, or a local invalidation (tlbiel) that only affects this core.
Currently we do local invalidations if the VM has only one vcpu or if
the guest requests it with the H_LOCAL flag, though the guest Linux
kernel currently doesn't ever use H_LOCAL. Then, to cope with the
possibility that vcpus moving around to different physical cores might
expose stale TLB entries, there is some code in kvmppc_hv_entry to
flush the whole TLB of entries for this VM if either this vcpu is now
running on a different physical core from where it last ran, or if this
physical core last ran a different vcpu.

There are a number of problems on POWER7 with this as it stands:

- The TLB invalidation is done per thread, whereas it only needs to be
done per core, since the TLB is shared between the threads.
- With the possibility of the host paging out guest pages, the use of
H_LOCAL by an SMP guest is dangerous since the guest could possibly
retain and use a stale TLB entry pointing to a page that had been
removed from the guest.
- The TLB invalidations that we do when a vcpu moves from one physical
core to another are unnecessary in the case of an SMP guest that isn't
using H_LOCAL.
- The optimization of using local invalidations rather than global should
apply to guests with one virtual core, not just one vcpu.

(None of this applies on PPC970, since there we always have to
invalidate the whole TLB when entering and leaving the guest, and we
can't support paging out guest memory.)

To fix these problems and simplify the code, we now maintain a simple
cpumask of which cpus need to flush the TLB on entry to the guest.
(This is indexed by cpu, though we only ever use the bits for thread
0 of each core.) Whenever we do a local TLB invalidation, we set the
bits for every cpu except the bit for thread 0 of the core that we're
currently running on. Whenever we enter a guest, we test and clear the
bit for our core, and flush the TLB if it was set.

On initial startup of the VM, and when resetting the HPT, we set all the
bits in the need_tlb_flush cpumask, since any core could potentially have
stale TLB entries left over from a previous VM that used the same LPID, or
from the previous contents of the HPT.

Then, we maintain a count of the number of online virtual cores, and use
that, rather than the number of online vcpus, when deciding whether to use
a local invalidation. The code to make that decision is extracted out
into a new function, global_invalidates(). For multi-core guests on
POWER7 (i.e. when we are using mmu notifiers), we now never do local
invalidations regardless of the H_LOCAL flag.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>

authored by

Paul Mackerras and committed by
Alexander Graf
1b400ba0 6a7f972d

+73 -45
+3 -2
arch/powerpc/include/asm/kvm_host.h
··· 246 246 int using_mmu_notifiers; 247 247 u32 hpt_order; 248 248 atomic_t vcpus_running; 249 + u32 online_vcores; 249 250 unsigned long hpt_npte; 250 251 unsigned long hpt_mask; 251 252 atomic_t hpte_mod_interest; 252 253 spinlock_t slot_phys_lock; 253 - unsigned short last_vcpu[NR_CPUS]; 254 + cpumask_t need_tlb_flush; 254 255 struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; 255 256 struct kvmppc_linear_info *hpt_li; 256 257 #endif /* CONFIG_KVM_BOOK3S_64_HV */ ··· 276 275 int nap_count; 277 276 int napping_threads; 278 277 u16 pcpu; 278 + u16 last_cpu; 279 279 u8 vcore_state; 280 280 u8 in_guest; 281 281 struct list_head runnable_threads; ··· 525 523 u64 dec_jiffies; 526 524 u64 dec_expires; 527 525 unsigned long pending_exceptions; 528 - u16 last_cpu; 529 526 u8 ceded; 530 527 u8 prodded; 531 528 u32 last_inst;
+1 -3
arch/powerpc/kernel/asm-offsets.c
··· 441 441 DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); 442 442 DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1)); 443 443 DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock)); 444 - DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter)); 445 - DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu)); 444 + DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); 446 445 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); 447 446 DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); 448 447 DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); ··· 469 470 DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); 470 471 DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max)); 471 472 DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); 472 - DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu)); 473 473 DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); 474 474 DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); 475 475 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
+2 -5
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 148 148 * Reset all the reverse-mapping chains for all memslots 149 149 */ 150 150 kvmppc_rmap_reset(kvm); 151 - /* 152 - * Set the whole last_vcpu array to an invalid vcpu number. 153 - * This ensures that each vcpu will flush its TLB on next entry. 154 - */ 155 - memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu)); 151 + /* Ensure that each vcpu will flush its TLB on next entry. */ 152 + cpumask_setall(&kvm->arch.need_tlb_flush); 156 153 *htab_orderp = order; 157 154 err = 0; 158 155 } else {
+8 -1
arch/powerpc/kvm/book3s_hv.c
··· 853 853 goto free_vcpu; 854 854 855 855 vcpu->arch.shared = &vcpu->arch.shregs; 856 - vcpu->arch.last_cpu = -1; 857 856 vcpu->arch.mmcr[0] = MMCR0_FC; 858 857 vcpu->arch.ctrl = CTRL_RUNLATCH; 859 858 /* default to host PVR, since we can't spoof it */ ··· 879 880 vcore->preempt_tb = TB_NIL; 880 881 } 881 882 kvm->arch.vcores[core] = vcore; 883 + kvm->arch.online_vcores++; 882 884 } 883 885 mutex_unlock(&kvm->lock); 884 886 ··· 1801 1801 if (lpid < 0) 1802 1802 return -ENOMEM; 1803 1803 kvm->arch.lpid = lpid; 1804 + 1805 + /* 1806 + * Since we don't flush the TLB when tearing down a VM, 1807 + * and this lpid might have previously been used, 1808 + * make sure we flush on each core before running the new VM. 1809 + */ 1810 + cpumask_setall(&kvm->arch.need_tlb_flush); 1804 1811 1805 1812 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1806 1813
+33 -4
arch/powerpc/kvm/book3s_hv_rm_mmu.c
··· 35 35 return __va(addr); 36 36 } 37 37 38 + /* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */ 39 + static int global_invalidates(struct kvm *kvm, unsigned long flags) 40 + { 41 + int global; 42 + 43 + /* 44 + * If there is only one vcore, and it's currently running, 45 + * we can use tlbiel as long as we mark all other physical 46 + * cores as potentially having stale TLB entries for this lpid. 47 + * If we're not using MMU notifiers, we never take pages away 48 + * from the guest, so we can use tlbiel if requested. 49 + * Otherwise, don't use tlbiel. 50 + */ 51 + if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore) 52 + global = 0; 53 + else if (kvm->arch.using_mmu_notifiers) 54 + global = 1; 55 + else 56 + global = !(flags & H_LOCAL); 57 + 58 + if (!global) { 59 + /* any other core might now have stale TLB entries... */ 60 + smp_wmb(); 61 + cpumask_setall(&kvm->arch.need_tlb_flush); 62 + cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, 63 + &kvm->arch.need_tlb_flush); 64 + } 65 + 66 + return global; 67 + } 68 + 38 69 /* 39 70 * Add this HPTE into the chain for the real page. 40 71 * Must be called with the chain locked; it unlocks the chain. 
··· 421 390 if (v & HPTE_V_VALID) { 422 391 hpte[0] &= ~HPTE_V_VALID; 423 392 rb = compute_tlbie_rb(v, hpte[1], pte_index); 424 - if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) { 393 + if (global_invalidates(kvm, flags)) { 425 394 while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) 426 395 cpu_relax(); 427 396 asm volatile("ptesync" : : : "memory"); ··· 596 565 return H_NOT_FOUND; 597 566 } 598 567 599 - if (atomic_read(&kvm->online_vcpus) == 1) 600 - flags |= H_LOCAL; 601 568 v = hpte[0]; 602 569 bits = (flags << 55) & HPTE_R_PP0; 603 570 bits |= (flags << 48) & HPTE_R_KEY_HI; ··· 616 587 if (v & HPTE_V_VALID) { 617 588 rb = compute_tlbie_rb(v, r, pte_index); 618 589 hpte[0] = v & ~HPTE_V_VALID; 619 - if (!(flags & H_LOCAL)) { 590 + if (global_invalidates(kvm, flags)) { 620 591 while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) 621 592 cpu_relax(); 622 593 asm volatile("ptesync" : : : "memory");
+26 -30
arch/powerpc/kvm/book3s_hv_rmhandlers.S
··· 313 313 mtspr SPRN_SDR1,r6 /* switch to partition page table */ 314 314 mtspr SPRN_LPID,r7 315 315 isync 316 + 317 + /* See if we need to flush the TLB */ 318 + lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ 319 + clrldi r7,r6,64-6 /* extract bit number (6 bits) */ 320 + srdi r6,r6,6 /* doubleword number */ 321 + sldi r6,r6,3 /* address offset */ 322 + add r6,r6,r9 323 + addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ 316 324 li r0,1 325 + sld r0,r0,r7 326 + ld r7,0(r6) 327 + and. r7,r7,r0 328 + beq 22f 329 + 23: ldarx r7,0,r6 /* if set, clear the bit */ 330 + andc r7,r7,r0 331 + stdcx. r7,0,r6 332 + bne 23b 333 + li r6,128 /* and flush the TLB */ 334 + mtctr r6 335 + li r7,0x800 /* IS field = 0b10 */ 336 + ptesync 337 + 28: tlbiel r7 338 + addi r7,r7,0x1000 339 + bdnz 28b 340 + ptesync 341 + 342 + 22: li r0,1 317 343 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ 318 344 b 10f 319 345 ··· 361 335 li r12,BOOK3S_INTERRUPT_HV_DECREMENTER 362 336 mr r9,r4 363 337 blt hdec_soon 364 - 365 - /* 366 - * Invalidate the TLB if we could possibly have stale TLB 367 - * entries for this partition on this core due to the use 368 - * of tlbiel. 369 - * XXX maybe only need this on primary thread? 370 - */ 371 - ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ 372 - lwz r5,VCPU_VCPUID(r4) 373 - lhz r6,PACAPACAINDEX(r13) 374 - rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */ 375 - lhz r8,VCPU_LAST_CPU(r4) 376 - sldi r7,r6,1 /* see if this is the same vcpu */ 377 - add r7,r7,r9 /* as last ran on this pcpu */ 378 - lhz r0,KVM_LAST_VCPU(r7) 379 - cmpw r6,r8 /* on the same cpu core as last time? */ 380 - bne 3f 381 - cmpw r0,r5 /* same vcpu as this core last ran? 
*/ 382 - beq 1f 383 - 3: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */ 384 - sth r5,KVM_LAST_VCPU(r7) 385 - li r6,128 386 - mtctr r6 387 - li r7,0x800 /* IS field = 0b10 */ 388 - ptesync 389 - 2: tlbiel r7 390 - addi r7,r7,0x1000 391 - bdnz 2b 392 - ptesync 393 - 1: 394 338 395 339 /* Save purr/spurr */ 396 340 mfspr r5,SPRN_PURR