Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: PPC: Book3S HV: Improve handling of debug-trigger HMIs on POWER9

Hypervisor maintenance interrupts (HMIs) are generated by various
causes, signalled by bits in the hypervisor maintenance exception
register (HMER). In most cases calling OPAL to handle the interrupt
is the correct thing to do, but the "debug trigger" HMIs signalled by
PPC bit 17 (bit 46 in LSB-0 numbering) of HMER are used to invoke software workarounds
for hardware bugs, and OPAL does not have any code to handle this
cause. The debug trigger HMI is used in POWER9 DD2.0 and DD2.1 chips
to work around a hardware bug in executing vector load instructions to
cache inhibited memory. In POWER9 DD2.2 chips, it is generated when
conditions are detected relating to threads being in TM (transactional
memory) suspended mode when the core SMT configuration needs to be
reconfigured.

The kernel currently has code to detect the vector CI load condition,
but only when the HMI occurs in the host, not when it occurs in a
guest. If an HMI occurs in the guest, it is always passed to OPAL, and
then we always re-sync the timebase, because the HMI cause might have
been a timebase error, for which OPAL would re-sync the timebase, thus
removing the timebase offset which KVM applied for the guest. Since
we don't know what OPAL did, we don't know whether to subtract the
timebase offset from the timebase, so instead we re-sync the timebase.

This adds code to determine explicitly what the cause of a debug
trigger HMI will be. This is based on a new device-tree property
under the CPU nodes called ibm,hmi-special-triggers, if it is
present, or otherwise based on the PVR (processor version register).
The handling of debug trigger HMIs is pulled out into a separate
function which can be called from the KVM guest exit code. If this
function handles and clears the HMI, and no other HMI causes remain,
then we skip calling OPAL and we proceed to subtract the guest
timebase offset from the timebase.

The overall handling for HMIs that occur in the host (i.e. not in a
KVM guest) is largely unchanged, except that we no longer set the flag
for the vector CI load workaround on DD2.2 processors.

This also removes a BUG_ON in the KVM code. BUG_ON is generally not
useful in KVM guest entry/exit code since it is difficult to handle
the resulting trap gracefully.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

authored by

Paul Mackerras and committed by
Michael Ellerman
d075745d 7f1c410d

+131 -37
+4
arch/powerpc/include/asm/hmi.h
··· 42 42 static inline void wait_for_subcore_guest_exit(void) { } 43 43 static inline void wait_for_tb_resync(void) { } 44 44 #endif 45 + 46 + struct pt_regs; 47 + extern long hmi_handle_debugtrig(struct pt_regs *regs); 48 + 45 49 #endif /* __ASM_PPC64_HMI_H__ */
+3 -2
arch/powerpc/include/asm/reg.h
··· 432 432 #define SPRN_LPID 0x13F /* Logical Partition Identifier */ 433 433 #endif 434 434 #define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */ 435 - #define SPRN_HMER 0x150 /* Hardware m? error recovery */ 436 - #define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ 435 + #define SPRN_HMER 0x150 /* Hypervisor maintenance exception reg */ 436 + #define HMER_DEBUG_TRIG (1ul << (63 - 17)) /* Debug trigger */ 437 + #define SPRN_HMEER 0x151 /* Hyp maintenance exception enable reg */ 437 438 #define SPRN_PCR 0x152 /* Processor compatibility register */ 438 439 #define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */ 439 440 #define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */
+114 -28
arch/powerpc/kernel/mce.c
··· 495 495 return handled; 496 496 } 497 497 498 - long hmi_exception_realmode(struct pt_regs *regs) 498 + /* Possible meanings for HMER_DEBUG_TRIG bit being set on POWER9 */ 499 + static enum { 500 + DTRIG_UNKNOWN, 501 + DTRIG_VECTOR_CI, /* need to emulate vector CI load instr */ 502 + DTRIG_SUSPEND_ESCAPE, /* need to escape from TM suspend mode */ 503 + } hmer_debug_trig_function; 504 + 505 + static int init_debug_trig_function(void) 499 506 { 507 + int pvr; 508 + struct device_node *cpun; 509 + struct property *prop = NULL; 510 + const char *str; 511 + 512 + /* First look in the device tree */ 513 + preempt_disable(); 514 + cpun = of_get_cpu_node(smp_processor_id(), NULL); 515 + if (cpun) { 516 + of_property_for_each_string(cpun, "ibm,hmi-special-triggers", 517 + prop, str) { 518 + if (strcmp(str, "bit17-vector-ci-load") == 0) 519 + hmer_debug_trig_function = DTRIG_VECTOR_CI; 520 + else if (strcmp(str, "bit17-tm-suspend-escape") == 0) 521 + hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE; 522 + } 523 + of_node_put(cpun); 524 + } 525 + preempt_enable(); 526 + 527 + /* If we found the property, don't look at PVR */ 528 + if (prop) 529 + goto out; 530 + 531 + pvr = mfspr(SPRN_PVR); 532 + /* Check for POWER9 Nimbus (scale-out) */ 533 + if ((PVR_VER(pvr) == PVR_POWER9) && (pvr & 0xe000) == 0) { 534 + /* DD2.2 and later */ 535 + if ((pvr & 0xfff) >= 0x202) 536 + hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE; 537 + /* DD2.0 and DD2.1 - used for vector CI load emulation */ 538 + else if ((pvr & 0xfff) >= 0x200) 539 + hmer_debug_trig_function = DTRIG_VECTOR_CI; 540 + } 541 + 542 + out: 543 + switch (hmer_debug_trig_function) { 544 + case DTRIG_VECTOR_CI: 545 + pr_debug("HMI debug trigger used for vector CI load\n"); 546 + break; 547 + case DTRIG_SUSPEND_ESCAPE: 548 + pr_debug("HMI debug trigger used for TM suspend escape\n"); 549 + break; 550 + default: 551 + break; 552 + } 553 + return 0; 554 + } 555 + __initcall(init_debug_trig_function); 556 + 557 + /* 558 + * 
Handle HMIs that occur as a result of a debug trigger. 559 + * Return values: 560 + * -1 means this is not a HMI cause that we know about 561 + * 0 means no further handling is required 562 + * 1 means further handling is required 563 + */ 564 + long hmi_handle_debugtrig(struct pt_regs *regs) 565 + { 566 + unsigned long hmer = mfspr(SPRN_HMER); 567 + long ret = 0; 568 + 569 + /* HMER_DEBUG_TRIG bit is used for various workarounds on P9 */ 570 + if (!((hmer & HMER_DEBUG_TRIG) 571 + && hmer_debug_trig_function != DTRIG_UNKNOWN)) 572 + return -1; 573 + 574 + hmer &= ~HMER_DEBUG_TRIG; 575 + /* HMER is a write-AND register */ 576 + mtspr(SPRN_HMER, ~HMER_DEBUG_TRIG); 577 + 578 + switch (hmer_debug_trig_function) { 579 + case DTRIG_VECTOR_CI: 580 + /* 581 + * Now to avoid problems with soft-disable we 582 + * only do the emulation if we are coming from 583 + * host user space 584 + */ 585 + if (regs && user_mode(regs)) 586 + ret = local_paca->hmi_p9_special_emu = 1; 587 + 588 + break; 589 + 590 + default: 591 + break; 592 + } 593 + 594 + /* 595 + * See if any other HMI causes remain to be handled 596 + */ 597 + if (hmer & mfspr(SPRN_HMEER)) 598 + return -1; 599 + 600 + return ret; 601 + } 602 + 603 + /* 604 + * Return values: 605 + */ 606 + long hmi_exception_realmode(struct pt_regs *regs) 607 + { 608 + int ret; 609 + 500 610 __this_cpu_inc(irq_stat.hmi_exceptions); 501 611 502 - #ifdef CONFIG_PPC_BOOK3S_64 503 - /* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */ 504 - if (pvr_version_is(PVR_POWER9)) { 505 - unsigned long hmer = mfspr(SPRN_HMER); 506 - 507 - /* Do we have the debug bit set */ 508 - if (hmer & PPC_BIT(17)) { 509 - hmer &= ~PPC_BIT(17); 510 - mtspr(SPRN_HMER, hmer); 511 - 512 - /* 513 - * Now to avoid problems with soft-disable we 514 - * only do the emulation if we are coming from 515 - * user space 516 - */ 517 - if (user_mode(regs)) 518 - local_paca->hmi_p9_special_emu = 1; 519 - 520 - /* 521 - * Don't bother going to OPAL if that's the 
522 - * only relevant bit. 523 - */ 524 - if (!(hmer & mfspr(SPRN_HMEER))) 525 - return local_paca->hmi_p9_special_emu; 526 - } 527 - } 528 - #endif /* CONFIG_PPC_BOOK3S_64 */ 612 + ret = hmi_handle_debugtrig(regs); 613 + if (ret >= 0) 614 + return ret; 529 615 530 616 wait_for_subcore_guest_exit(); 531 617
+5 -3
arch/powerpc/kvm/book3s_hv_ras.c
··· 268 268 * secondary threads to proceed. 269 269 * - All secondary threads will eventually call opal hmi handler on 270 270 * their exit path. 271 + * 272 + * Returns 1 if the timebase offset should be applied, 0 if not. 271 273 */ 272 274 273 275 long kvmppc_realmode_hmi_handler(void) 274 276 { 275 - int ptid = local_paca->kvm_hstate.ptid; 276 277 bool resync_req; 277 278 278 - /* This is only called on primary thread. */ 279 - BUG_ON(ptid != 0); 280 279 __this_cpu_inc(irq_stat.hmi_exceptions); 280 + 281 + if (hmi_handle_debugtrig(NULL) >= 0) 282 + return 1; 281 283 282 284 /* 283 285 * By now primary thread has already completed guest->host
+5 -4
arch/powerpc/kvm/book3s_hv_rmhandlers.S
··· 1909 1909 bne 27f 1910 1910 bl kvmppc_realmode_hmi_handler 1911 1911 nop 1912 + cmpdi r3, 0 1912 1913 li r12, BOOK3S_INTERRUPT_HMI 1913 1914 /* 1914 - * At this point kvmppc_realmode_hmi_handler would have resync-ed 1915 - * the TB. Hence it is not required to subtract guest timebase 1916 - * offset from timebase. So, skip it. 1915 + * At this point kvmppc_realmode_hmi_handler may have resync-ed 1916 + * the TB, and if it has, we must not subtract the guest timebase 1917 + * offset from the timebase. So, skip it. 1917 1918 * 1918 1919 * Also, do not call kvmppc_subcore_exit_guest() because it has 1919 1920 * been invoked as part of kvmppc_realmode_hmi_handler(). 1920 1921 */ 1921 - b 30f 1922 + beq 30f 1922 1923 1923 1924 27: 1924 1925 /* Subtract timebase offset from timebase */