Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[IA64] MCA recovery: kernel context recovery table

Memory errors encountered by user applications may surface
when the CPU is running in kernel context. The current code
will not attempt recovery if the MCA surfaces in kernel
context (privilege mode 0). This patch adds a check for cases
where the user initiated the load that surfaces in kernel
interrupt code.

An example is a user process launching a load from memory
and the data in memory had bad ECC. Before the bad data
gets to the CPU register, an interrupt comes in. The
code jumps to the IVT interrupt entry point and begins
execution in kernel context. The process of saving the
user registers (SAVE_REST) causes the bad data to be loaded
into a CPU register, triggering the MCA. The MCA surfaces in
kernel context, even though the load was initiated from
user context.

As suggested by David and Tony, this patch uses an exception
table like approach, putting the tagged recovery addresses in
a searchable table. One difference from the exception table
is that MCAs do not surface in precise places (such as with
a TLB miss), so instead of tagging specific instructions,
address ranges are registered. A single macro is used to do
the tagging, with the input parameter being the label
of the starting address and the macro being the ending
address. This limits clutter in the code.

This patch only tags one spot, the interrupt ivt entry.
Testing showed that spot to be a "heavy hitter" with
MCAs surfacing while saving user registers. Other spots
can be added as needed by adding a single macro.

Signed-off-by: Russ Anderson (rja@sgi.com)
Signed-off-by: Tony Luck <tony.luck@intel.com>

authored by

Russ Anderson and committed by
Tony Luck
d2a28ad9 a5b00bb4

+120 -41
+1
arch/ia64/kernel/ivt.S
··· 865 865 ;; 866 866 SAVE_REST 867 867 ;; 868 + MCA_RECOVER_RANGE(interrupt) 868 869 alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group 869 870 mov out0=cr.ivr // pass cr.ivr as first arg 870 871 add out1=16,sp // pass pointer to pt_regs as second arg
+73 -25
arch/ia64/kernel/mca.c
··· 83 83 #include <asm/irq.h> 84 84 #include <asm/hw_irq.h> 85 85 86 + #include "mca_drv.h" 86 87 #include "entry.h" 87 88 88 89 #if defined(IA64_MCA_DEBUG_INFO) ··· 281 280 if (rh->severity == sal_log_severity_corrected) 282 281 ia64_sal_clear_state_info(sal_info_type); 283 282 } 283 + 284 + /* 285 + * search_mca_table 286 + * See if the MCA surfaced in an instruction range 287 + * that has been tagged as recoverable. 288 + * 289 + * Inputs 290 + * first First address range to check 291 + * last Last address range to check 292 + * ip Instruction pointer, address we are looking for 293 + * 294 + * Return value: 295 + * 1 on Success (in the table)/ 0 on Failure (not in the table) 296 + */ 297 + int 298 + search_mca_table (const struct mca_table_entry *first, 299 + const struct mca_table_entry *last, 300 + unsigned long ip) 301 + { 302 + const struct mca_table_entry *curr; 303 + u64 curr_start, curr_end; 304 + 305 + curr = first; 306 + while (curr <= last) { 307 + curr_start = (u64) &curr->start_addr + curr->start_addr; 308 + curr_end = (u64) &curr->end_addr + curr->end_addr; 309 + 310 + if ((ip >= curr_start) && (ip <= curr_end)) { 311 + return 1; 312 + } 313 + curr++; 314 + } 315 + return 0; 316 + } 317 + 318 + /* Given an address, look for it in the mca tables. 
*/ 319 + int mca_recover_range(unsigned long addr) 320 + { 321 + extern struct mca_table_entry __start___mca_table[]; 322 + extern struct mca_table_entry __stop___mca_table[]; 323 + 324 + return search_mca_table(__start___mca_table, __stop___mca_table-1, addr); 325 + } 326 + EXPORT_SYMBOL_GPL(mca_recover_range); 284 327 285 328 #ifdef CONFIG_ACPI 286 329 ··· 792 747 ia64_mca_modify_comm(previous_current); 793 748 goto no_mod; 794 749 } 795 - if (r13 != sos->prev_IA64_KR_CURRENT) { 796 - msg = "inconsistent previous current and r13"; 797 - goto no_mod; 798 - } 799 - if ((r12 - r13) >= KERNEL_STACK_SIZE) { 800 - msg = "inconsistent r12 and r13"; 801 - goto no_mod; 802 - } 803 - if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) { 804 - msg = "inconsistent ar.bspstore and r13"; 805 - goto no_mod; 806 - } 807 - va.p = old_bspstore; 808 - if (va.f.reg < 5) { 809 - msg = "old_bspstore is in the wrong region"; 810 - goto no_mod; 811 - } 812 - if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) { 813 - msg = "inconsistent ar.bsp and r13"; 814 - goto no_mod; 815 - } 816 - size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8; 817 - if (ar_bspstore + size > r12) { 818 - msg = "no room for blocked state"; 819 - goto no_mod; 750 + 751 + if (!mca_recover_range(ms->pmsa_iip)) { 752 + if (r13 != sos->prev_IA64_KR_CURRENT) { 753 + msg = "inconsistent previous current and r13"; 754 + goto no_mod; 755 + } 756 + if ((r12 - r13) >= KERNEL_STACK_SIZE) { 757 + msg = "inconsistent r12 and r13"; 758 + goto no_mod; 759 + } 760 + if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) { 761 + msg = "inconsistent ar.bspstore and r13"; 762 + goto no_mod; 763 + } 764 + va.p = old_bspstore; 765 + if (va.f.reg < 5) { 766 + msg = "old_bspstore is in the wrong region"; 767 + goto no_mod; 768 + } 769 + if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) { 770 + msg = "inconsistent ar.bsp and r13"; 771 + goto no_mod; 772 + } 773 + size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8; 774 + if (ar_bspstore 
+ size > r12) { 775 + msg = "no room for blocked state"; 776 + goto no_mod; 777 + } 820 778 } 821 779 822 780 ia64_mca_modify_comm(previous_current);
+15 -7
arch/ia64/kernel/mca_drv.c
··· 6 6 * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) 7 7 * Copyright (C) 2005 Silicon Graphics, Inc 8 8 * Copyright (C) 2005 Keith Owens <kaos@sgi.com> 9 + * Copyright (C) 2006 Russ Anderson <rja@sgi.com> 9 10 */ 10 11 #include <linux/config.h> 11 12 #include <linux/types.h> ··· 122 121 */ 123 122 124 123 void 125 - mca_handler_bh(unsigned long paddr) 124 + mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr) 126 125 { 127 - printk(KERN_ERR 128 - "OS_MCA: process [pid: %d](%s) encounters MCA (paddr=%lx)\n", 129 - current->pid, current->comm, paddr); 126 + printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, " 127 + "iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n", 128 + raw_smp_processor_id(), current->pid, current->uid, 129 + iip, ipsr, paddr, current->comm); 130 130 131 131 spin_lock(&mca_bh_lock); 132 132 switch (mca_page_isolate(paddr)) { ··· 444 442 if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate)) 445 443 return 0; 446 444 psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr); 445 + psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr); 447 446 448 447 /* 449 448 * Check the privilege level of interrupted context. 450 449 * If it is user-mode, then terminate affected process. 451 450 */ 452 - if (psr1->cpl != 0) { 451 + 452 + pmsa = sos->pal_min_state; 453 + if (psr1->cpl != 0 || 454 + ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) { 453 455 smei = peidx_bus_check(peidx, 0); 454 456 if (smei->valid.target_identifier) { 455 457 /* 456 458 * setup for resume to bottom half of MCA, 457 459 * "mca_handler_bhhook" 458 460 */ 459 - pmsa = sos->pal_min_state; 460 - /* pass to bhhook as 1st argument (gr8) */ 461 + /* pass to bhhook as argument (gr8, ...) 
*/ 461 462 pmsa->pmsa_gr[8-1] = smei->target_identifier; 463 + pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip; 464 + pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr; 462 465 /* set interrupted return address (but no use) */ 463 466 pmsa->pmsa_br0 = pmsa->pmsa_iip; 464 467 /* change resume address to bottom half */ ··· 473 466 psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr; 474 467 psr2->cpl = 0; 475 468 psr2->ri = 0; 469 + psr2->bn = 1; 476 470 psr2->i = 0; 477 471 478 472 return 1;
+7
arch/ia64/kernel/mca_drv.h
··· 111 111 slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\ 112 112 __count; }) 113 113 114 + struct mca_table_entry { 115 + int start_addr; /* location-relative starting address of MCA recoverable range */ 116 + int end_addr; /* location-relative ending address of MCA recoverable range */ 117 + }; 118 + 119 + extern const struct mca_table_entry *search_mca_tables (unsigned long addr); 120 + extern int mca_recover_range(unsigned long);
+4 -9
arch/ia64/kernel/mca_drv_asm.S
··· 14 14 15 15 GLOBAL_ENTRY(mca_handler_bhhook) 16 16 invala // clear RSE ? 17 - ;; 18 17 cover 19 18 ;; 20 19 clrrrb 21 20 ;; 22 - alloc r16=ar.pfs,0,2,1,0 // make a new frame 23 - ;; 21 + alloc r16=ar.pfs,0,2,3,0 // make a new frame 24 22 mov ar.rsc=0 25 - ;; 26 23 mov r13=IA64_KR(CURRENT) // current task pointer 27 24 ;; 28 25 mov r2=r13 ··· 27 30 addl r22=IA64_RBS_OFFSET,r2 28 31 ;; 29 32 mov ar.bspstore=r22 30 - ;; 31 33 addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 32 34 ;; 33 35 adds r2=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 ··· 36 40 movl loc1=mca_handler_bh // recovery C function 37 41 ;; 38 42 mov out0=r8 // poisoned address 43 + mov out1=r9 // iip 44 + mov out2=r10 // psr 39 45 mov b6=loc1 40 46 ;; 41 47 mov loc1=rp 42 - ;; 43 - ssm psr.i 44 - ;; 48 + ssm psr.i | psr.ic 45 49 br.call.sptk.many rp=b6 // does not return ... 46 50 ;; 47 51 mov ar.pfs=loc0 ··· 49 53 ;; 50 54 mov r8=r0 51 55 br.ret.sptk.many rp 52 - ;; 53 56 END(mca_handler_bhhook)
+9
arch/ia64/kernel/vmlinux.lds.S
··· 130 130 __initcall_end = .; 131 131 } 132 132 133 + /* MCA table */ 134 + . = ALIGN(16); 135 + __mca_table : AT(ADDR(__mca_table) - LOAD_OFFSET) 136 + { 137 + __start___mca_table = .; 138 + *(__mca_table) 139 + __stop___mca_table = .; 140 + } 141 + 133 142 .data.patch.vtop : AT(ADDR(.data.patch.vtop) - LOAD_OFFSET) 134 143 { 135 144 __start___vtop_patchlist = .;
+11
include/asm-ia64/asmmacro.h
··· 51 51 [99:] x 52 52 53 53 /* 54 + * Tag MCA recoverable instruction ranges. 55 + */ 56 + 57 + .section "__mca_table", "a" // declare section & section attributes 58 + .previous 59 + 60 + # define MCA_RECOVER_RANGE(y) \ 61 + .xdata4 "__mca_table", y-., 99f-.; \ 62 + [99:] 63 + 64 + /* 54 65 * Mark instructions that need a load of a virtual address patched to be 55 66 * a load of a physical address. We use this either in critical performance 56 67 * path (ivt.S - TLB miss processing) or in places where it might not be