Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/book3s: Queue up and process delayed MCE events.

When the machine check real mode handler cannot continue into the host kernel
in V mode, it returns from the interrupt and we lose the MCE event, which
never gets logged. In such a situation, queue up the MCE event so that
we can log it later when we get back into the host kernel with r1 pointing to
the kernel stack, e.g. during syscall exit.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

authored by

Mahesh Salgaonkar and committed by
Benjamin Herrenschmidt
b5ff4211 36df96f8

+168 -98
+3
arch/powerpc/include/asm/mce.h
··· 190 190 struct mce_error_info *mce_err, uint64_t addr); 191 191 extern int get_mce_event(struct machine_check_event *mce, bool release); 192 192 extern void release_mce_event(void); 193 + extern void machine_check_queue_event(void); 194 + extern void machine_check_process_queued_event(void); 195 + extern void machine_check_print_event_info(struct machine_check_event *evt); 193 196 194 197 #endif /* __ASM_PPC64_MCE_H__ */
+5
arch/powerpc/kernel/entry_64.S
··· 184 184 bl .do_show_syscall_exit 185 185 ld r3,RESULT(r1) 186 186 #endif 187 + #ifdef CONFIG_PPC_BOOK3S_64 188 + BEGIN_FTR_SECTION 189 + bl .machine_check_process_queued_event 190 + END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) 191 + #endif 187 192 CURRENT_THREAD_INFO(r12, r1) 188 193 189 194 ld r8,_MSR(r1)
+5 -2
arch/powerpc/kernel/exceptions-64s.S
··· 855 855 /* Supervisor state loss */ 856 856 li r0,1 857 857 stb r0,PACA_NAPSTATELOST(r13) 858 - 3: MACHINE_CHECK_HANDLER_WINDUP 858 + 3: bl .machine_check_queue_event 859 + MACHINE_CHECK_HANDLER_WINDUP 859 860 GET_PACA(r13) 860 861 ld r1,PACAR1(r13) 861 862 b .power7_enter_nap_mode ··· 896 895 2: 897 896 /* 898 897 * Return from MC interrupt. 899 - * TODO: Queue up the MCE event so that we can log it later. 898 + * Queue up the MCE event so that we can log it later, while 899 + * returning from kernel or opal call. 900 900 */ 901 + bl .machine_check_queue_event 901 902 MACHINE_CHECK_HANDLER_WINDUP 902 903 rfid 903 904 9:
+154
arch/powerpc/kernel/mce.c
··· 31 31 static DEFINE_PER_CPU(int, mce_nest_count); 32 32 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); 33 33 34 + /* Queue for delayed MCE events. */ 35 + static DEFINE_PER_CPU(int, mce_queue_count); 36 + static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); 37 + 34 38 static void mce_set_error_info(struct machine_check_event *mce, 35 39 struct mce_error_info *mce_err) 36 40 { ··· 165 161 void release_mce_event(void) 166 162 { 167 163 get_mce_event(NULL, true); 164 + } 165 + 166 + /* 167 + * Queue up the MCE event which then can be handled later. 168 + */ 169 + void machine_check_queue_event(void) 170 + { 171 + int index; 172 + struct machine_check_event evt; 173 + 174 + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) 175 + return; 176 + 177 + index = __get_cpu_var(mce_queue_count)++; 178 + /* If queue is full, just return for now. */ 179 + if (index >= MAX_MC_EVT) { 180 + __get_cpu_var(mce_queue_count)--; 181 + return; 182 + } 183 + __get_cpu_var(mce_event_queue[index]) = evt; 184 + } 185 + 186 + /* 187 + * process pending MCE event from the mce event queue. This function will be 188 + * called during syscall exit. 189 + */ 190 + void machine_check_process_queued_event(void) 191 + { 192 + int index; 193 + 194 + preempt_disable(); 195 + /* 196 + * For now just print it to console. 197 + * TODO: log this error event to FSP or nvram. 
198 + */ 199 + while (__get_cpu_var(mce_queue_count) > 0) { 200 + index = __get_cpu_var(mce_queue_count) - 1; 201 + machine_check_print_event_info( 202 + &__get_cpu_var(mce_event_queue[index])); 203 + __get_cpu_var(mce_queue_count)--; 204 + } 205 + preempt_enable(); 206 + } 207 + 208 + void machine_check_print_event_info(struct machine_check_event *evt) 209 + { 210 + const char *level, *sevstr, *subtype; 211 + static const char *mc_ue_types[] = { 212 + "Indeterminate", 213 + "Instruction fetch", 214 + "Page table walk ifetch", 215 + "Load/Store", 216 + "Page table walk Load/Store", 217 + }; 218 + static const char *mc_slb_types[] = { 219 + "Indeterminate", 220 + "Parity", 221 + "Multihit", 222 + }; 223 + static const char *mc_erat_types[] = { 224 + "Indeterminate", 225 + "Parity", 226 + "Multihit", 227 + }; 228 + static const char *mc_tlb_types[] = { 229 + "Indeterminate", 230 + "Parity", 231 + "Multihit", 232 + }; 233 + 234 + /* Print things out */ 235 + if (evt->version != MCE_V1) { 236 + pr_err("Machine Check Exception, Unknown event version %d !\n", 237 + evt->version); 238 + return; 239 + } 240 + switch (evt->severity) { 241 + case MCE_SEV_NO_ERROR: 242 + level = KERN_INFO; 243 + sevstr = "Harmless"; 244 + break; 245 + case MCE_SEV_WARNING: 246 + level = KERN_WARNING; 247 + sevstr = ""; 248 + break; 249 + case MCE_SEV_ERROR_SYNC: 250 + level = KERN_ERR; 251 + sevstr = "Severe"; 252 + break; 253 + case MCE_SEV_FATAL: 254 + default: 255 + level = KERN_ERR; 256 + sevstr = "Fatal"; 257 + break; 258 + } 259 + 260 + printk("%s%s Machine check interrupt [%s]\n", level, sevstr, 261 + evt->disposition == MCE_DISPOSITION_RECOVERED ? 262 + "Recovered" : "[Not recovered"); 263 + printk("%s Initiator: %s\n", level, 264 + evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown"); 265 + switch (evt->error_type) { 266 + case MCE_ERROR_TYPE_UE: 267 + subtype = evt->u.ue_error.ue_error_type < 268 + ARRAY_SIZE(mc_ue_types) ? 
269 + mc_ue_types[evt->u.ue_error.ue_error_type] 270 + : "Unknown"; 271 + printk("%s Error type: UE [%s]\n", level, subtype); 272 + if (evt->u.ue_error.effective_address_provided) 273 + printk("%s Effective address: %016llx\n", 274 + level, evt->u.ue_error.effective_address); 275 + if (evt->u.ue_error.physical_address_provided) 276 + printk("%s Physial address: %016llx\n", 277 + level, evt->u.ue_error.physical_address); 278 + break; 279 + case MCE_ERROR_TYPE_SLB: 280 + subtype = evt->u.slb_error.slb_error_type < 281 + ARRAY_SIZE(mc_slb_types) ? 282 + mc_slb_types[evt->u.slb_error.slb_error_type] 283 + : "Unknown"; 284 + printk("%s Error type: SLB [%s]\n", level, subtype); 285 + if (evt->u.slb_error.effective_address_provided) 286 + printk("%s Effective address: %016llx\n", 287 + level, evt->u.slb_error.effective_address); 288 + break; 289 + case MCE_ERROR_TYPE_ERAT: 290 + subtype = evt->u.erat_error.erat_error_type < 291 + ARRAY_SIZE(mc_erat_types) ? 292 + mc_erat_types[evt->u.erat_error.erat_error_type] 293 + : "Unknown"; 294 + printk("%s Error type: ERAT [%s]\n", level, subtype); 295 + if (evt->u.erat_error.effective_address_provided) 296 + printk("%s Effective address: %016llx\n", 297 + level, evt->u.erat_error.effective_address); 298 + break; 299 + case MCE_ERROR_TYPE_TLB: 300 + subtype = evt->u.tlb_error.tlb_error_type < 301 + ARRAY_SIZE(mc_tlb_types) ? 302 + mc_tlb_types[evt->u.tlb_error.tlb_error_type] 303 + : "Unknown"; 304 + printk("%s Error type: TLB [%s]\n", level, subtype); 305 + if (evt->u.tlb_error.effective_address_provided) 306 + printk("%s Effective address: %016llx\n", 307 + level, evt->u.tlb_error.effective_address); 308 + break; 309 + default: 310 + case MCE_ERROR_TYPE_UNKNOWN: 311 + printk("%s Error type: Unknown\n", level); 312 + break; 313 + } 168 314 }
+1 -96
arch/powerpc/platforms/powernv/opal.c
··· 258 258 int opal_machine_check(struct pt_regs *regs) 259 259 { 260 260 struct machine_check_event evt; 261 - const char *level, *sevstr, *subtype; 262 - static const char *opal_mc_ue_types[] = { 263 - "Indeterminate", 264 - "Instruction fetch", 265 - "Page table walk ifetch", 266 - "Load/Store", 267 - "Page table walk Load/Store", 268 - }; 269 - static const char *opal_mc_slb_types[] = { 270 - "Indeterminate", 271 - "Parity", 272 - "Multihit", 273 - }; 274 - static const char *opal_mc_erat_types[] = { 275 - "Indeterminate", 276 - "Parity", 277 - "Multihit", 278 - }; 279 - static const char *opal_mc_tlb_types[] = { 280 - "Indeterminate", 281 - "Parity", 282 - "Multihit", 283 - }; 284 261 285 262 if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) 286 263 return 0; ··· 268 291 evt.version); 269 292 return 0; 270 293 } 271 - switch(evt.severity) { 272 - case MCE_SEV_NO_ERROR: 273 - level = KERN_INFO; 274 - sevstr = "Harmless"; 275 - break; 276 - case MCE_SEV_WARNING: 277 - level = KERN_WARNING; 278 - sevstr = ""; 279 - break; 280 - case MCE_SEV_ERROR_SYNC: 281 - level = KERN_ERR; 282 - sevstr = "Severe"; 283 - break; 284 - case MCE_SEV_FATAL: 285 - default: 286 - level = KERN_ERR; 287 - sevstr = "Fatal"; 288 - break; 289 - } 294 + machine_check_print_event_info(&evt); 290 295 291 - printk("%s%s Machine check interrupt [%s]\n", level, sevstr, 292 - evt.disposition == MCE_DISPOSITION_RECOVERED ? 293 - "Recovered" : "[Not recovered"); 294 - printk("%s Initiator: %s\n", level, 295 - evt.initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown"); 296 - switch(evt.error_type) { 297 - case MCE_ERROR_TYPE_UE: 298 - subtype = evt.u.ue_error.ue_error_type < 299 - ARRAY_SIZE(opal_mc_ue_types) ? 
300 - opal_mc_ue_types[evt.u.ue_error.ue_error_type] 301 - : "Unknown"; 302 - printk("%s Error type: UE [%s]\n", level, subtype); 303 - if (evt.u.ue_error.effective_address_provided) 304 - printk("%s Effective address: %016llx\n", 305 - level, evt.u.ue_error.effective_address); 306 - if (evt.u.ue_error.physical_address_provided) 307 - printk("%s Physial address: %016llx\n", 308 - level, evt.u.ue_error.physical_address); 309 - break; 310 - case MCE_ERROR_TYPE_SLB: 311 - subtype = evt.u.slb_error.slb_error_type < 312 - ARRAY_SIZE(opal_mc_slb_types) ? 313 - opal_mc_slb_types[evt.u.slb_error.slb_error_type] 314 - : "Unknown"; 315 - printk("%s Error type: SLB [%s]\n", level, subtype); 316 - if (evt.u.slb_error.effective_address_provided) 317 - printk("%s Effective address: %016llx\n", 318 - level, evt.u.slb_error.effective_address); 319 - break; 320 - case MCE_ERROR_TYPE_ERAT: 321 - subtype = evt.u.erat_error.erat_error_type < 322 - ARRAY_SIZE(opal_mc_erat_types) ? 323 - opal_mc_erat_types[evt.u.erat_error.erat_error_type] 324 - : "Unknown"; 325 - printk("%s Error type: ERAT [%s]\n", level, subtype); 326 - if (evt.u.erat_error.effective_address_provided) 327 - printk("%s Effective address: %016llx\n", 328 - level, evt.u.erat_error.effective_address); 329 - break; 330 - case MCE_ERROR_TYPE_TLB: 331 - subtype = evt.u.tlb_error.tlb_error_type < 332 - ARRAY_SIZE(opal_mc_tlb_types) ? 333 - opal_mc_tlb_types[evt.u.tlb_error.tlb_error_type] 334 - : "Unknown"; 335 - printk("%s Error type: TLB [%s]\n", level, subtype); 336 - if (evt.u.tlb_error.effective_address_provided) 337 - printk("%s Effective address: %016llx\n", 338 - level, evt.u.tlb_error.effective_address); 339 - break; 340 - default: 341 - case MCE_ERROR_TYPE_UNKNOWN: 342 - printk("%s Error type: Unknown\n", level); 343 - break; 344 - } 345 296 return evt.severity == MCE_SEV_FATAL ? 0 : 1; 346 297 } 347 298