Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: PPC: Book3S HV: Handle guest-caused machine checks on POWER7 without panicking

Currently, if a machine check interrupt happens while we are in the
guest, we exit the guest and call the host's machine check handler,
which tends to cause the host to panic. Some machine checks can be
triggered by the guest; for example, if the guest creates two entries
in the SLB that map the same effective address, and then accesses that
effective address, the CPU will take a machine check interrupt.

To handle this better, when a machine check happens inside the guest,
we call a new function, kvmppc_realmode_machine_check(), while still in
real mode before exiting the guest. On POWER7, it handles the cases
that the guest can trigger, either by flushing and reloading the SLB,
or by flushing the TLB, and then it delivers the machine check interrupt
directly to the guest without going back to the host. On POWER7, the
OPAL firmware patches the machine check interrupt vector so that it
gets control first, and it leaves behind its analysis of the situation
in a structure pointed to by the opal_mc_evt field of the paca. The
kvmppc_realmode_machine_check() function looks at this, and if OPAL
reports that there was no error, or that it has handled the error, we
also go straight back to the guest with a machine check. We have to
deliver a machine check to the guest since the machine check interrupt
might have trashed valid values in SRR0/1.

If the machine check is one that we can't handle in real mode and that
OPAL hasn't already handled, or if we are running on PPC970, we exit the
guest and call the host's machine check handler. We do this by jumping to the
machine_check_fwnmi label, rather than absolute address 0x200, because
we don't want to re-execute OPAL's handler on POWER7. On PPC970, the
two are equivalent because address 0x200 just contains a branch.

Then, if the host machine check handler decides that the system can
continue executing, kvmppc_handle_exit() delivers a machine check
interrupt to the guest -- once again to let the guest know that SRR0/1
have been modified.

Signed-off-by: Paul Mackerras <paulus@samba.org>
[agraf: fix checkpatch warnings]
Signed-off-by: Alexander Graf <agraf@suse.de>

authored by

Paul Mackerras and committed by
Alexander Graf
b4072df4 1b400ba0

+213 -28
+10
arch/powerpc/include/asm/mmu-hash64.h
··· 121 121 #define PP_RXRX 3 /* Supervisor read, User read */ 122 122 #define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */ 123 123 124 + /* Fields for tlbiel instruction in architecture 2.06 */ 125 + #define TLBIEL_INVAL_SEL_MASK 0xc00 /* invalidation selector */ 126 + #define TLBIEL_INVAL_PAGE 0x000 /* invalidate a single page */ 127 + #define TLBIEL_INVAL_SET_LPID 0x800 /* invalidate a set for current LPID */ 128 + #define TLBIEL_INVAL_SET 0xc00 /* invalidate a set for all LPIDs */ 129 + #define TLBIEL_INVAL_SET_MASK 0xfff000 /* set number to inval. */ 130 + #define TLBIEL_INVAL_SET_SHIFT 12 131 + 132 + #define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */ 133 + 124 134 #ifndef __ASSEMBLY__ 125 135 126 136 struct hash_pte {
+1
arch/powerpc/kvm/Makefile
··· 73 73 book3s_hv_rmhandlers.o \ 74 74 book3s_hv_rm_mmu.o \ 75 75 book3s_64_vio_hv.o \ 76 + book3s_hv_ras.o \ 76 77 book3s_hv_builtin.o 77 78 78 79 kvm-book3s_64-module-objs := \
+11
arch/powerpc/kvm/book3s_hv.c
··· 545 545 case BOOK3S_INTERRUPT_PERFMON: 546 546 r = RESUME_GUEST; 547 547 break; 548 + case BOOK3S_INTERRUPT_MACHINE_CHECK: 549 + /* 550 + * Deliver a machine check interrupt to the guest. 551 + * We have to do this, even if the host has handled the 552 + * machine check, because machine checks use SRR0/1 and 553 + * the interrupt might have trashed guest state in them. 554 + */ 555 + kvmppc_book3s_queue_irqprio(vcpu, 556 + BOOK3S_INTERRUPT_MACHINE_CHECK); 557 + r = RESUME_GUEST; 558 + break; 548 559 case BOOK3S_INTERRUPT_PROGRAM: 549 560 { 550 561 ulong flags;
+144
arch/powerpc/kvm/book3s_hv_ras.c
··· 1 + /* 2 + * This program is free software; you can redistribute it and/or modify 3 + * it under the terms of the GNU General Public License, version 2, as 4 + * published by the Free Software Foundation. 5 + * 6 + * Copyright 2012 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 + */ 8 + 9 + #include <linux/types.h> 10 + #include <linux/string.h> 11 + #include <linux/kvm.h> 12 + #include <linux/kvm_host.h> 13 + #include <linux/kernel.h> 14 + #include <asm/opal.h> 15 + 16 + /* SRR1 bits for machine check on POWER7 */ 17 + #define SRR1_MC_LDSTERR (1ul << (63-42)) 18 + #define SRR1_MC_IFETCH_SH (63-45) 19 + #define SRR1_MC_IFETCH_MASK 0x7 20 + #define SRR1_MC_IFETCH_SLBPAR 2 /* SLB parity error */ 21 + #define SRR1_MC_IFETCH_SLBMULTI 3 /* SLB multi-hit */ 22 + #define SRR1_MC_IFETCH_SLBPARMULTI 4 /* SLB parity + multi-hit */ 23 + #define SRR1_MC_IFETCH_TLBMULTI 5 /* I-TLB multi-hit */ 24 + 25 + /* DSISR bits for machine check on POWER7 */ 26 + #define DSISR_MC_DERAT_MULTI 0x800 /* D-ERAT multi-hit */ 27 + #define DSISR_MC_TLB_MULTI 0x400 /* D-TLB multi-hit */ 28 + #define DSISR_MC_SLB_PARITY 0x100 /* SLB parity error */ 29 + #define DSISR_MC_SLB_MULTI 0x080 /* SLB multi-hit */ 30 + #define DSISR_MC_SLB_PARMULTI 0x040 /* SLB parity + multi-hit */ 31 + 32 + /* POWER7 SLB flush and reload */ 33 + static void reload_slb(struct kvm_vcpu *vcpu) 34 + { 35 + struct slb_shadow *slb; 36 + unsigned long i, n; 37 + 38 + /* First clear out SLB */ 39 + asm volatile("slbmte %0,%0; slbia" : : "r" (0)); 40 + 41 + /* Do they have an SLB shadow buffer registered? 
*/ 42 + slb = vcpu->arch.slb_shadow.pinned_addr; 43 + if (!slb) 44 + return; 45 + 46 + /* Sanity check */ 47 + n = min_t(u32, slb->persistent, SLB_MIN_SIZE); 48 + if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end) 49 + return; 50 + 51 + /* Load up the SLB from that */ 52 + for (i = 0; i < n; ++i) { 53 + unsigned long rb = slb->save_area[i].esid; 54 + unsigned long rs = slb->save_area[i].vsid; 55 + 56 + rb = (rb & ~0xFFFul) | i; /* insert entry number */ 57 + asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); 58 + } 59 + } 60 + 61 + /* POWER7 TLB flush */ 62 + static void flush_tlb_power7(struct kvm_vcpu *vcpu) 63 + { 64 + unsigned long i, rb; 65 + 66 + rb = TLBIEL_INVAL_SET_LPID; 67 + for (i = 0; i < POWER7_TLB_SETS; ++i) { 68 + asm volatile("tlbiel %0" : : "r" (rb)); 69 + rb += 1 << TLBIEL_INVAL_SET_SHIFT; 70 + } 71 + } 72 + 73 + /* 74 + * On POWER7, see if we can handle a machine check that occurred inside 75 + * the guest in real mode, without switching to the host partition. 76 + * 77 + * Returns: 0 => exit guest, 1 => deliver machine check to guest 78 + */ 79 + static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) 80 + { 81 + unsigned long srr1 = vcpu->arch.shregs.msr; 82 + struct opal_machine_check_event *opal_evt; 83 + long handled = 1; 84 + 85 + if (srr1 & SRR1_MC_LDSTERR) { 86 + /* error on load/store */ 87 + unsigned long dsisr = vcpu->arch.shregs.dsisr; 88 + 89 + if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | 90 + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) { 91 + /* flush and reload SLB; flushes D-ERAT too */ 92 + reload_slb(vcpu); 93 + dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | 94 + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI); 95 + } 96 + if (dsisr & DSISR_MC_TLB_MULTI) { 97 + flush_tlb_power7(vcpu); 98 + dsisr &= ~DSISR_MC_TLB_MULTI; 99 + } 100 + /* Any other errors we don't understand? 
*/ 101 + if (dsisr & 0xffffffffUL) 102 + handled = 0; 103 + } 104 + 105 + switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) { 106 + case 0: 107 + break; 108 + case SRR1_MC_IFETCH_SLBPAR: 109 + case SRR1_MC_IFETCH_SLBMULTI: 110 + case SRR1_MC_IFETCH_SLBPARMULTI: 111 + reload_slb(vcpu); 112 + break; 113 + case SRR1_MC_IFETCH_TLBMULTI: 114 + flush_tlb_power7(vcpu); 115 + break; 116 + default: 117 + handled = 0; 118 + } 119 + 120 + /* 121 + * See if OPAL has already handled the condition. 122 + * We assume that if the condition is recovered then OPAL 123 + * will have generated an error log event that we will pick 124 + * up and log later. 125 + */ 126 + opal_evt = local_paca->opal_mc_evt; 127 + if (opal_evt->version == OpalMCE_V1 && 128 + (opal_evt->severity == OpalMCE_SEV_NO_ERROR || 129 + opal_evt->disposition == OpalMCE_DISPOSITION_RECOVERED)) 130 + handled = 1; 131 + 132 + if (handled) 133 + opal_evt->in_use = 0; 134 + 135 + return handled; 136 + } 137 + 138 + long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) 139 + { 140 + if (cpu_has_feature(CPU_FTR_ARCH_206)) 141 + return kvmppc_realmode_mc_power7(vcpu); 142 + 143 + return 0; 144 + }
+47 -28
arch/powerpc/kvm/book3s_hv_rmhandlers.S
··· 27 27 #include <asm/asm-offsets.h> 28 28 #include <asm/exception-64s.h> 29 29 #include <asm/kvm_book3s_asm.h> 30 + #include <asm/mmu-hash64.h> 30 31 31 32 /***************************************************************************** 32 33 * * ··· 679 678 1: 680 679 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 681 680 682 - nohpte_cont: 683 - hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 681 + guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 684 682 /* Save DEC */ 685 683 mfspr r5,SPRN_DEC 686 684 mftb r6 ··· 699 699 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 700 700 std r6, VCPU_FAULT_DAR(r9) 701 701 stw r7, VCPU_FAULT_DSISR(r9) 702 + 703 + /* See if it is a machine check */ 704 + cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK 705 + beq machine_check_realmode 706 + mc_cont: 702 707 703 708 /* Save guest CTRL register, set runlatch to 1 */ 704 709 6: mfspr r6,SPRN_CTRLF ··· 1117 1112 /* 1118 1113 * For external and machine check interrupts, we need 1119 1114 * to call the Linux handler to process the interrupt. 1120 - * We do that by jumping to the interrupt vector address 1121 - * which we have in r12. The [h]rfid at the end of the 1115 + * We do that by jumping to absolute address 0x500 for 1116 + * external interrupts, or the machine_check_fwnmi label 1117 + * for machine checks (since firmware might have patched 1118 + * the vector area at 0x200). The [h]rfid at the end of the 1122 1119 * handler will return to the book3s_hv_interrupts.S code. 1123 1120 * For other interrupts we do the rfid to get back 1124 - * to the book3s_interrupts.S code here. 1121 + * to the book3s_hv_interrupts.S code here. 
1125 1122 */ 1126 1123 ld r8, HSTATE_VMHANDLER(r13) 1127 1124 ld r7, HSTATE_HOST_MSR(r13) 1128 1125 1126 + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1129 1127 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 1128 + BEGIN_FTR_SECTION 1130 1129 beq 11f 1131 - cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1130 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 1132 1131 1133 1132 /* RFI into the highmem handler, or branch to interrupt handler */ 1134 - 12: mfmsr r6 1135 - mtctr r12 1133 + mfmsr r6 1136 1134 li r0, MSR_RI 1137 1135 andc r6, r6, r0 1138 1136 mtmsrd r6, 1 /* Clear RI in MSR */ 1139 1137 mtsrr0 r8 1140 1138 mtsrr1 r7 1141 - beqctr 1139 + beqa 0x500 /* external interrupt (PPC970) */ 1140 + beq cr1, 13f /* machine check */ 1142 1141 RFI 1143 1142 1144 - 11: 1145 - BEGIN_FTR_SECTION 1146 - b 12b 1147 - END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 1148 - mtspr SPRN_HSRR0, r8 1143 + /* On POWER7, we have external interrupts set to use HSRR0/1 */ 1144 + 11: mtspr SPRN_HSRR0, r8 1149 1145 mtspr SPRN_HSRR1, r7 1150 1146 ba 0x500 1147 + 1148 + 13: b machine_check_fwnmi 1151 1149 1152 1150 /* 1153 1151 * Check whether an HDSI is an HPTE not found fault or something else. ··· 1184 1176 cmpdi r3, 0 /* retry the instruction */ 1185 1177 beq 6f 1186 1178 cmpdi r3, -1 /* handle in kernel mode */ 1187 - beq nohpte_cont 1179 + beq guest_exit_cont 1188 1180 cmpdi r3, -2 /* MMIO emulation; need instr word */ 1189 1181 beq 2f 1190 1182 ··· 1198 1190 li r10, BOOK3S_INTERRUPT_DATA_STORAGE 1199 1191 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1200 1192 rotldi r11, r11, 63 1193 + fast_interrupt_c_return: 1201 1194 6: ld r7, VCPU_CTR(r9) 1202 1195 lwz r8, VCPU_XER(r9) 1203 1196 mtctr r7 ··· 1231 1222 /* Unset guest mode. 
*/ 1232 1223 li r0, KVM_GUEST_MODE_NONE 1233 1224 stb r0, HSTATE_IN_GUEST(r13) 1234 - b nohpte_cont 1225 + b guest_exit_cont 1235 1226 1236 1227 /* 1237 1228 * Similarly for an HISI, reflect it to the guest as an ISI unless ··· 1257 1248 ld r11, VCPU_MSR(r9) 1258 1249 li r12, BOOK3S_INTERRUPT_H_INST_STORAGE 1259 1250 cmpdi r3, 0 /* retry the instruction */ 1260 - beq 6f 1251 + beq fast_interrupt_c_return 1261 1252 cmpdi r3, -1 /* handle in kernel mode */ 1262 - beq nohpte_cont 1253 + beq guest_exit_cont 1263 1254 1264 1255 /* Synthesize an ISI for the guest */ 1265 1256 mr r11, r3 ··· 1268 1259 li r10, BOOK3S_INTERRUPT_INST_STORAGE 1269 1260 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1270 1261 rotldi r11, r11, 63 1271 - 6: ld r7, VCPU_CTR(r9) 1272 - lwz r8, VCPU_XER(r9) 1273 - mtctr r7 1274 - mtxer r8 1275 - mr r4, r9 1276 - b fast_guest_return 1262 + b fast_interrupt_c_return 1277 1263 1278 1264 3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ 1279 1265 ld r5, KVM_VRMA_SLB_V(r6) ··· 1284 1280 hcall_try_real_mode: 1285 1281 ld r3,VCPU_GPR(R3)(r9) 1286 1282 andi. 
r0,r11,MSR_PR 1287 - bne hcall_real_cont 1283 + bne guest_exit_cont 1288 1284 clrrdi r3,r3,2 1289 1285 cmpldi r3,hcall_real_table_end - hcall_real_table 1290 - bge hcall_real_cont 1286 + bge guest_exit_cont 1291 1287 LOAD_REG_ADDR(r4, hcall_real_table) 1292 1288 lwzx r3,r3,r4 1293 1289 cmpwi r3,0 1294 - beq hcall_real_cont 1290 + beq guest_exit_cont 1295 1291 add r3,r3,r4 1296 1292 mtctr r3 1297 1293 mr r3,r9 /* get vcpu pointer */ ··· 1312 1308 li r12,BOOK3S_INTERRUPT_SYSCALL 1313 1309 ld r9, HSTATE_KVM_VCPU(r13) 1314 1310 1315 - b hcall_real_cont 1311 + b guest_exit_cont 1316 1312 1317 1313 .globl hcall_real_table 1318 1314 hcall_real_table: ··· 1570 1566 kvm_cede_exit: 1571 1567 li r3,H_TOO_HARD 1572 1568 blr 1569 + 1570 + /* Try to handle a machine check in real mode */ 1571 + machine_check_realmode: 1572 + mr r3, r9 /* get vcpu pointer */ 1573 + bl .kvmppc_realmode_machine_check 1574 + nop 1575 + cmpdi r3, 0 /* continue exiting from guest? */ 1576 + ld r9, HSTATE_KVM_VCPU(r13) 1577 + li r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1578 + beq mc_cont 1579 + /* If not, deliver a machine check. SRR0/1 are already set */ 1580 + li r10, BOOK3S_INTERRUPT_MACHINE_CHECK 1581 + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1582 + rotldi r11, r11, 63 1583 + b fast_interrupt_c_return 1573 1584 1574 1585 secondary_too_late: 1575 1586 ld r5,HSTATE_KVM_VCORE(r13)