Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'edac_updates_for_v6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras

Pull Intel EDAC fixes from Tony Luck:

- Old igen6 driver could lose pending events during initialization

- Sapphire Rapids workstations have fewer memory controllers than their
bigger siblings. This confused the driver.

* tag 'edac_updates_for_v6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
EDAC/igen6: Fix the issue of no error events
EDAC/i10nm: Skip the absent memory controllers

+53 -9
+49 -5
drivers/edac/i10nm_base.c
··· 658 658 return mdev; 659 659 } 660 660 661 + /** 662 + * i10nm_imc_absent() - Check whether the memory controller @imc is absent 663 + * 664 + * @imc : The pointer to the structure of memory controller EDAC device. 665 + * 666 + * RETURNS : true if the memory controller EDAC device is absent, false otherwise. 667 + */ 668 + static bool i10nm_imc_absent(struct skx_imc *imc) 669 + { 670 + u32 mcmtr; 671 + int i; 672 + 673 + switch (res_cfg->type) { 674 + case SPR: 675 + for (i = 0; i < res_cfg->ddr_chan_num; i++) { 676 + mcmtr = I10NM_GET_MCMTR(imc, i); 677 + edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr); 678 + if (mcmtr != ~0) 679 + return false; 680 + } 681 + 682 + /* 683 + * Some workstations' absent memory controllers still 684 + * appear as PCIe devices, misleading the EDAC driver. 685 + * By observing that the MMIO registers of these absent 686 + * memory controllers consistently hold the value of ~0. 687 + * 688 + * We identify a memory controller as absent by checking 689 + * if its MMIO register "mcmtr" == ~0 in all its channels. 
690 + */ 691 + return true; 692 + default: 693 + return false; 694 + } 695 + } 696 + 661 697 static int i10nm_get_ddr_munits(void) 662 698 { 663 699 struct pci_dev *mdev; 664 700 void __iomem *mbase; 665 701 unsigned long size; 666 702 struct skx_dev *d; 667 - int i, j = 0; 703 + int i, lmc, j = 0; 668 704 u32 reg, off; 669 705 u64 base; 670 706 ··· 726 690 edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", 727 691 j++, base, reg); 728 692 729 - for (i = 0; i < res_cfg->ddr_imc_num; i++) { 693 + for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) { 730 694 mdev = get_ddr_munit(d, i, &off, &size); 731 695 732 696 if (i == 0 && !mdev) { ··· 735 699 } 736 700 if (!mdev) 737 701 continue; 738 - 739 - d->imc[i].mdev = mdev; 740 702 741 703 edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n", 742 704 i, base + off, size, reg); ··· 746 712 return -ENODEV; 747 713 } 748 714 749 - d->imc[i].mbase = mbase; 715 + d->imc[lmc].mbase = mbase; 716 + if (i10nm_imc_absent(&d->imc[lmc])) { 717 + pci_dev_put(mdev); 718 + iounmap(mbase); 719 + d->imc[lmc].mbase = NULL; 720 + edac_dbg(2, "Skip absent mc%d\n", i); 721 + continue; 722 + } else { 723 + d->imc[lmc].mdev = mdev; 724 + lmc++; 725 + } 750 726 } 751 727 } 752 728
+4 -4
drivers/edac/igen6_edac.c
··· 27 27 #include "edac_mc.h" 28 28 #include "edac_module.h" 29 29 30 - #define IGEN6_REVISION "v2.5" 30 + #define IGEN6_REVISION "v2.5.1" 31 31 32 32 #define EDAC_MOD_STR "igen6_edac" 33 33 #define IGEN6_NMI_NAME "igen6_ibecc" ··· 1216 1216 INIT_WORK(&ecclog_work, ecclog_work_cb); 1217 1217 init_irq_work(&ecclog_irq_work, ecclog_irq_work_cb); 1218 1218 1219 - /* Check if any pending errors before registering the NMI handler */ 1220 - ecclog_handler(); 1221 - 1222 1219 rc = register_err_handler(); 1223 1220 if (rc) 1224 1221 goto fail3; ··· 1226 1229 igen6_printk(KERN_ERR, "Failed to enable error reporting\n"); 1227 1230 goto fail4; 1228 1231 } 1232 + 1233 + /* Check if any pending errors before/during the registration of the error handler */ 1234 + ecclog_handler(); 1229 1235 1230 1236 igen6_debug_setup(); 1231 1237 return 0;