Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/mce: Fix all mce notifiers to update the mce->kflags bitmask

If a handler took any action to log or deal with the error, set a bit
in mce->kflags so that the default handler at the end of the machine
check chain can see what has been done.

Get rid of NOTIFY_STOP returns. Make the EDAC and dev-mcelog handlers
skip over errors already processed by CEC.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: https://lkml.kernel.org/r/20200214222720.13168-5-tony.luck@intel.com

authored by

Tony Luck and committed by
Borislav Petkov
23ba710a 1de08dcc

+37 -12
+3 -1
arch/x86/kernel/cpu/mce/core.c
··· 581 581 return NOTIFY_DONE; 582 582 583 583 pfn = mce->addr >> PAGE_SHIFT; 584 - if (!memory_failure(pfn, 0)) 584 + if (!memory_failure(pfn, 0)) { 585 585 set_mce_nospec(pfn); 586 + mce->kflags |= MCE_HANDLED_UC; 587 + } 586 588 587 589 return NOTIFY_OK; 588 590 }
+5
arch/x86/kernel/cpu/mce/dev-mcelog.c
··· 39 39 struct mce *mce = (struct mce *)data; 40 40 unsigned int entry; 41 41 42 + if (mce->kflags & MCE_HANDLED_CEC) 43 + return NOTIFY_DONE; 44 + 42 45 mutex_lock(&mce_chrdev_read_mutex); 43 46 44 47 entry = mcelog->next; ··· 59 56 60 57 memcpy(mcelog->entry + entry, mce, sizeof(struct mce)); 61 58 mcelog->entry[entry].finished = 1; 59 + mcelog->entry[entry].kflags = 0; 62 60 63 61 /* wake processes polling /dev/mcelog */ 64 62 wake_up_interruptible(&mce_chrdev_wait); ··· 67 63 unlock: 68 64 mutex_unlock(&mce_chrdev_read_mutex); 69 65 66 + mce->kflags |= MCE_HANDLED_MCELOG; 70 67 return NOTIFY_OK; 71 68 } 72 69
+3 -2
drivers/acpi/acpi_extlog.c
··· 146 146 static u32 err_seq; 147 147 148 148 estatus = extlog_elog_entry_check(cpu, bank); 149 - if (estatus == NULL) 149 + if (estatus == NULL || (mce->kflags & MCE_HANDLED_CEC)) 150 150 return NOTIFY_DONE; 151 151 152 152 memcpy(elog_buf, (void *)estatus, ELOG_ENTRY_LEN); ··· 176 176 } 177 177 178 178 out: 179 - return NOTIFY_STOP; 179 + mce->kflags |= MCE_HANDLED_EXTLOG; 180 + return NOTIFY_OK; 180 181 } 181 182 182 183 static bool __init extlog_get_l1addr(void)
+1
drivers/acpi/nfit/mce.c
··· 76 76 */ 77 77 acpi_nfit_ars_rescan(acpi_desc, 0); 78 78 } 79 + mce->kflags |= MCE_HANDLED_NFIT; 79 80 break; 80 81 } 81 82
+3 -2
drivers/edac/i7core_edac.c
··· 1815 1815 struct mem_ctl_info *mci; 1816 1816 1817 1817 i7_dev = get_i7core_dev(mce->socketid); 1818 - if (!i7_dev) 1818 + if (!i7_dev || (mce->kflags & MCE_HANDLED_CEC)) 1819 1819 return NOTIFY_DONE; 1820 1820 1821 1821 mci = i7_dev->mci; ··· 1834 1834 i7core_check_error(mci, mce); 1835 1835 1836 1836 /* Advise mcelog that the errors were handled */ 1837 - return NOTIFY_STOP; 1837 + mce->kflags |= MCE_HANDLED_EDAC; 1838 + return NOTIFY_OK; 1838 1839 } 1839 1840 1840 1841 static struct notifier_block i7_mce_dec = {
+5 -1
drivers/edac/mce_amd.c
··· 1046 1046 unsigned int fam = x86_family(m->cpuid); 1047 1047 int ecc; 1048 1048 1049 + if (m->kflags & MCE_HANDLED_CEC) 1050 + return NOTIFY_DONE; 1051 + 1049 1052 pr_emerg(HW_ERR "%s\n", decode_error_status(m)); 1050 1053 1051 1054 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", ··· 1149 1146 err_code: 1150 1147 amd_decode_err_code(m->status & 0xffff); 1151 1148 1152 - return NOTIFY_STOP; 1149 + m->kflags |= MCE_HANDLED_EDAC; 1150 + return NOTIFY_OK; 1153 1151 } 1154 1152 1155 1153 static struct notifier_block amd_mce_dec_nb = {
+3 -2
drivers/edac/pnd2_edac.c
··· 1400 1400 return NOTIFY_DONE; 1401 1401 1402 1402 mci = pnd2_mci; 1403 - if (!mci) 1403 + if (!mci || (mce->kflags & MCE_HANDLED_CEC)) 1404 1404 return NOTIFY_DONE; 1405 1405 1406 1406 /* ··· 1429 1429 pnd2_mce_output_error(mci, mce, &daddr); 1430 1430 1431 1431 /* Advice mcelog that the error were handled */ 1432 - return NOTIFY_STOP; 1432 + mce->kflags |= MCE_HANDLED_EDAC; 1433 + return NOTIFY_OK; 1433 1434 } 1434 1435 1435 1436 static struct notifier_block pnd2_mce_dec = {
+4 -1
drivers/edac/sb_edac.c
··· 3136 3136 3137 3137 if (edac_get_report_status() == EDAC_REPORTING_DISABLED) 3138 3138 return NOTIFY_DONE; 3139 + if (mce->kflags & MCE_HANDLED_CEC) 3140 + return NOTIFY_DONE; 3139 3141 3140 3142 /* 3141 3143 * Just let mcelog handle it if the error is ··· 3185 3183 sbridge_mce_output_error(mci, mce); 3186 3184 3187 3185 /* Advice mcelog that the error were handled */ 3188 - return NOTIFY_STOP; 3186 + mce->kflags |= MCE_HANDLED_EDAC; 3187 + return NOTIFY_OK; 3189 3188 } 3190 3189 3191 3190 static struct notifier_block sbridge_mce_dec = {
+4
drivers/edac/skx_common.c
··· 577 577 if (edac_get_report_status() == EDAC_REPORTING_DISABLED) 578 578 return NOTIFY_DONE; 579 579 580 + if (mce->kflags & MCE_HANDLED_CEC) 581 + return NOTIFY_DONE; 582 + 580 583 /* ignore unless this is memory related with an address */ 581 584 if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) 582 585 return NOTIFY_DONE; ··· 619 616 620 617 skx_mce_output_error(mci, mce, &res); 621 618 619 + mce->kflags |= MCE_HANDLED_EDAC; 622 620 return NOTIFY_DONE; 623 621 } 624 622
+6 -3
drivers/ras/cec.c
··· 538 538 /* We eat only correctable DRAM errors with usable addresses. */ 539 539 if (mce_is_memory_error(m) && 540 540 mce_is_correctable(m) && 541 - mce_usable_address(m)) 542 - if (!cec_add_elem(m->addr >> PAGE_SHIFT)) 543 - return NOTIFY_STOP; 541 + mce_usable_address(m)) { 542 + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) { 543 + m->kflags |= MCE_HANDLED_CEC; 544 + return NOTIFY_OK; 545 + } 546 + } 544 547 545 548 return NOTIFY_DONE; 546 549 }