Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/powernv: Machine check exception handling.

Add basic error handling in machine check exception handler.

- If MSR_RI isn't set, we cannot recover.
- Check if the disposition is set to OpalMCE_DISPOSITION_RECOVERED.
- Check whether the faulting address is inside the kernel address space; if it
is not, send SIGBUS to the process if we hit the exception while in userspace.
- If the faulting address is not provided and we get a synchronous machine
check while in userspace, kill the task.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

authored by

Mahesh Salgaonkar and committed by
Benjamin Herrenschmidt
b63a0ffe 28446de2

+70 -1
+1
arch/powerpc/include/asm/mce.h
··· 193 193 extern void machine_check_queue_event(void); 194 194 extern void machine_check_process_queued_event(void); 195 195 extern void machine_check_print_event_info(struct machine_check_event *evt); 196 + extern uint64_t get_mce_fault_addr(struct machine_check_event *evt); 196 197 197 198 #endif /* __ASM_PPC64_MCE_H__ */
+27
arch/powerpc/kernel/mce.c
··· 316 316 break; 317 317 } 318 318 } 319 + 320 + uint64_t get_mce_fault_addr(struct machine_check_event *evt) 321 + { 322 + switch (evt->error_type) { 323 + case MCE_ERROR_TYPE_UE: 324 + if (evt->u.ue_error.effective_address_provided) 325 + return evt->u.ue_error.effective_address; 326 + break; 327 + case MCE_ERROR_TYPE_SLB: 328 + if (evt->u.slb_error.effective_address_provided) 329 + return evt->u.slb_error.effective_address; 330 + break; 331 + case MCE_ERROR_TYPE_ERAT: 332 + if (evt->u.erat_error.effective_address_provided) 333 + return evt->u.erat_error.effective_address; 334 + break; 335 + case MCE_ERROR_TYPE_TLB: 336 + if (evt->u.tlb_error.effective_address_provided) 337 + return evt->u.tlb_error.effective_address; 338 + break; 339 + default: 340 + case MCE_ERROR_TYPE_UNKNOWN: 341 + break; 342 + } 343 + return 0; 344 + } 345 + EXPORT_SYMBOL(get_mce_fault_addr);
+42 -1
arch/powerpc/platforms/powernv/opal.c
··· 18 18 #include <linux/interrupt.h> 19 19 #include <linux/notifier.h> 20 20 #include <linux/slab.h> 21 + #include <linux/sched.h> 21 22 #include <linux/kobject.h> 22 23 #include <asm/opal.h> 23 24 #include <asm/firmware.h> ··· 252 251 return written; 253 252 } 254 253 254 + /* Decide whether a machine check can be treated as recovered: returns 1 if so, 0 otherwise. Never recovered when MSR_RI is clear in regs->msr. Otherwise recovered when the event disposition is MCE_DISPOSITION_RECOVERED, or when the exception was taken in user mode by a task other than global init and either the fault address is a non-kernel address or the severity is MCE_SEV_ERROR_SYNC; in those user-mode cases SIGBUS (BUS_MCEERR_AR) is raised via _exception() with regs->nip as the address. */ static int opal_recover_mce(struct pt_regs *regs, 255 + struct machine_check_event *evt) 256 + { 257 + int recovered = 0; 258 + uint64_t ea = get_mce_fault_addr(evt); 259 + 260 + if (!(regs->msr & MSR_RI)) { 261 + /* If MSR_RI isn't set, we cannot recover */ 262 + recovered = 0; 263 + } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { 264 + /* Platform corrected itself */ 265 + recovered = 1; 266 + } else if (ea && !is_kernel_addr(ea)) { 267 + /* 268 + * Faulting address is not a kernel address. We should be fine. 269 + * We need to find which process uses this address. 270 + * For now, kill the task if we have received exception when 271 + * in userspace. 272 + * 273 + * TODO: Queue up this address for hwpoisoning later. 274 + */ 275 + if (user_mode(regs) && !is_global_init(current)) { 276 + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 277 + recovered = 1; 278 + } else 279 + recovered = 0; 280 + } else if (user_mode(regs) && !is_global_init(current) && 281 + evt->severity == MCE_SEV_ERROR_SYNC) { 282 + /* 283 + * If we have received a synchronous error when in userspace 284 + * kill the task. 285 + */ 286 + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 287 + recovered = 1; 288 + } 289 + return recovered; 290 + } 291 + 255 292 int opal_machine_check(struct pt_regs *regs) 256 293 { 257 294 struct machine_check_event evt; ··· 305 266 } 306 267 machine_check_print_event_info(&evt); 307 268 308 - return evt.severity == MCE_SEV_FATAL ? 0 : 1; 269 + if (opal_recover_mce(regs, &evt)) 270 + return 1; 271 + return 0; 309 272 } 310 273 311 274 static irqreturn_t opal_interrupt(int irq, void *data)