[IA64] Proper handling of TLB errors from duplicate itr.d dropins

Jack Steiner noticed that duplicate TLB DTC entries do not cause a
Linux panic. See the discussion:

http://www.gelato.unsw.edu.au/archives/linux-ia64/0307/6108.html

The current TLB recovery code recovers from the duplicate itr.d
dropins, masking the underlying problem. This change modifies
the MCA recovery code to look for the TLB check signature of a
duplicate TLB entry and to panic in that case, because a duplicate
entry is a software bug and must always be treated as fatal.

Signed-off-by: Russ Anderson <rja@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>

authored by Russ Anderson and committed by Tony Luck 618b206f 908e0a8a

+36 -6
+2 -6
arch/ia64/kernel/mca.c
··· 1192 ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, 1193 struct ia64_sal_os_state *sos) 1194 { 1195 - pal_processor_state_info_t *psp = (pal_processor_state_info_t *) 1196 - &sos->proc_state_param; 1197 int recover, cpu = smp_processor_id(); 1198 struct task_struct *previous_current; 1199 struct ia64_mca_notify_die nd = ··· 1221 /* Get the MCA error record and log it */ 1222 ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA); 1223 1224 - /* TLB error is only exist in this SAL error record */ 1225 - recover = (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc)) 1226 - /* other error recovery */ 1227 - || (ia64_mca_ucmc_extension 1228 && ia64_mca_ucmc_extension( 1229 IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA), 1230 sos));
··· 1192 ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, 1193 struct ia64_sal_os_state *sos) 1194 { 1195 int recover, cpu = smp_processor_id(); 1196 struct task_struct *previous_current; 1197 struct ia64_mca_notify_die nd = ··· 1223 /* Get the MCA error record and log it */ 1224 ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA); 1225 1226 + /* MCA error recovery */ 1227 + recover = (ia64_mca_ucmc_extension 1228 && ia64_mca_ucmc_extension( 1229 IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA), 1230 sos));
+33
arch/ia64/kernel/mca_drv.c
··· 607 return status; 608 } 609 610 /** 611 * recover_from_processor_error 612 * @platform: whether there are some platform error section or not ··· 677 */ 678 if (psp->us || psp->ci == 0) 679 return fatal_mca("error not contained"); 680 681 /* 682 * The cache check and bus check bits have four possible states
··· 607 return status; 608 } 609 610 + /* 611 + * recover_from_tlb_check 612 + * @peidx: pointer of index of processor error section 613 + * 614 + * Return value: 615 + * 1 on Success / 0 on Failure 616 + */ 617 + static int 618 + recover_from_tlb_check(peidx_table_t *peidx) 619 + { 620 + sal_log_mod_error_info_t *smei; 621 + pal_tlb_check_info_t *ptci; 622 + 623 + smei = (sal_log_mod_error_info_t *)peidx_tlb_check(peidx, 0); 624 + ptci = (pal_tlb_check_info_t *)&(smei->check_info); 625 + 626 + /* 627 + * Look for signature of a duplicate TLB DTC entry, which is 628 + * a SW bug and always fatal. 629 + */ 630 + if (ptci->op == PAL_TLB_CHECK_OP_PURGE 631 + && !(ptci->itr || ptci->dtc || ptci->itc)) 632 + return fatal_mca("Duplicate TLB entry"); 633 + 634 + return mca_recovered("TLB check recovered"); 635 + } 636 + 637 /** 638 * recover_from_processor_error 639 * @platform: whether there are some platform error section or not ··· 650 */ 651 if (psp->us || psp->ci == 0) 652 return fatal_mca("error not contained"); 653 + 654 + /* 655 + * Look for recoverable TLB check 656 + */ 657 + if (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc)) 658 + return recover_from_tlb_check(peidx); 659 660 /* 661 * The cache check and bus check bits have four possible states
+1
include/asm-ia64/pal.h
··· 371 * dependent 372 */ 373 374 375 typedef struct pal_process_state_info_s { 376 u64 reserved1 : 2,
··· 371 * dependent 372 */ 373 374 + #define PAL_TLB_CHECK_OP_PURGE 8 375 376 typedef struct pal_process_state_info_s { 377 u64 reserved1 : 2,