Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

nfit: do an ARS scrub on hitting a latent media error

When a latent (unknown to 'badblocks') error is encountered, it will
trigger a machine check exception. On a system with machine check
recovery, this will only SIGBUS the process(es) which had the bad page
mapped (as opposed to a kernel panic on platforms without machine
check recovery features). In the former case, we want to trigger a full
rescan of that nvdimm bus. This will allow any additional, new errors
to be captured in the block devices' badblocks lists, and offending
operations on them can be trapped early, avoiding machine checks.

This is done by registering a callback function with the
x86_mce_decoder_chain and calling the new ars_rescan functionality with
the address in the mce notification.

Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

authored by

Vishal Verma and committed by
Dan Williams
6839a6d9 bdf97013

+133 -4
+1
drivers/acpi/nfit/Makefile
··· 1 1 obj-$(CONFIG_ACPI_NFIT) := nfit.o 2 2 nfit-y := core.o 3 + nfit-$(CONFIG_X86_MCE) += mce.o
+22 -4
drivers/acpi/nfit/core.c
··· 51 51 MODULE_PARM_DESC(disable_vendor_specific, 52 52 "Limit commands to the publicly specified set\n"); 53 53 54 + LIST_HEAD(acpi_descs); 55 + DEFINE_MUTEX(acpi_desc_lock); 56 + 54 57 static struct workqueue_struct *nfit_wq; 55 58 56 59 struct nfit_table_prev { ··· 364 361 return to_name[type]; 365 362 } 366 363 367 - static int nfit_spa_type(struct acpi_nfit_system_address *spa) 364 + int nfit_spa_type(struct acpi_nfit_system_address *spa) 368 365 { 369 366 int i; 370 367 ··· 900 897 device_unlock(dev); 901 898 return rc; 902 899 } 903 - 904 - static int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc); 905 900 906 901 static ssize_t scrub_store(struct device *dev, 907 902 struct device_attribute *attr, const char *buf, size_t size) ··· 2401 2400 struct acpi_nfit_desc *acpi_desc = data; 2402 2401 struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus); 2403 2402 2403 + /* 2404 + * Destruct under acpi_desc_lock so that nfit_handle_mce does not 2405 + * race teardown 2406 + */ 2407 + mutex_lock(&acpi_desc_lock); 2404 2408 acpi_desc->cancel = 1; 2405 2409 /* 2406 2410 * Bounce the nvdimm bus lock to make sure any in-flight ··· 2420 2414 sysfs_put(acpi_desc->scrub_count_state); 2421 2415 nvdimm_bus_unregister(acpi_desc->nvdimm_bus); 2422 2416 acpi_desc->nvdimm_bus = NULL; 2417 + list_del(&acpi_desc->list); 2418 + mutex_unlock(&acpi_desc_lock); 2423 2419 } 2424 2420 2425 2421 int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz) ··· 2447 2439 rc = acpi_nfit_desc_init_scrub_attr(acpi_desc); 2448 2440 if (rc) 2449 2441 return rc; 2442 + 2443 + /* register this acpi_desc for mce notifications */ 2444 + mutex_lock(&acpi_desc_lock); 2445 + list_add_tail(&acpi_desc->list, &acpi_descs); 2446 + mutex_unlock(&acpi_desc_lock); 2450 2447 } 2451 2448 2452 2449 mutex_lock(&acpi_desc->init_mutex); ··· 2562 2549 return 0; 2563 2550 } 2564 2551 2565 - static int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc) 2552 + int 
acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc) 2566 2553 { 2567 2554 struct device *dev = acpi_desc->dev; 2568 2555 struct nfit_spa *nfit_spa; ··· 2611 2598 INIT_LIST_HEAD(&acpi_desc->flushes); 2612 2599 INIT_LIST_HEAD(&acpi_desc->memdevs); 2613 2600 INIT_LIST_HEAD(&acpi_desc->dimms); 2601 + INIT_LIST_HEAD(&acpi_desc->list); 2614 2602 mutex_init(&acpi_desc->init_mutex); 2615 2603 INIT_WORK(&acpi_desc->work, acpi_nfit_scrub); 2616 2604 } ··· 2764 2750 if (!nfit_wq) 2765 2751 return -ENOMEM; 2766 2752 2753 + nfit_mce_register(); 2754 + 2767 2755 return acpi_bus_register_driver(&acpi_nfit_driver); 2768 2756 } 2769 2757 2770 2758 static __exit void nfit_exit(void) 2771 2759 { 2760 + nfit_mce_unregister(); 2772 2761 acpi_bus_unregister_driver(&acpi_nfit_driver); 2773 2762 destroy_workqueue(nfit_wq); 2763 + WARN_ON(!list_empty(&acpi_descs)); 2774 2764 } 2775 2765 2776 2766 module_init(nfit_init);
+89
drivers/acpi/nfit/mce.c
··· 1 + /* 2 + * NFIT - Machine Check Handler 3 + * 4 + * Copyright(c) 2013-2016 Intel Corporation. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of version 2 of the GNU General Public License as 8 + * published by the Free Software Foundation. 9 + * 10 + * This program is distributed in the hope that it will be useful, but 11 + * WITHOUT ANY WARRANTY; without even the implied warranty of 12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 + * General Public License for more details. 14 + */ 15 + #include <linux/notifier.h> 16 + #include <linux/acpi.h> 17 + #include <asm/mce.h> 18 + #include "nfit.h" 19 + 20 + static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, 21 + void *data) 22 + { 23 + struct mce *mce = (struct mce *)data; 24 + struct acpi_nfit_desc *acpi_desc; 25 + struct nfit_spa *nfit_spa; 26 + 27 + /* We only care about memory errors */ 28 + if (!(mce->status & MCACOD)) 29 + return NOTIFY_DONE; 30 + 31 + /* 32 + * mce->addr contains the physical addr accessed that caused the 33 + * machine check. We need to walk through the list of NFITs, and see 34 + * if any of them matches that address, and only then start a scrub. 
35 + */ 36 + mutex_lock(&acpi_desc_lock); 37 + list_for_each_entry(acpi_desc, &acpi_descs, list) { 38 + struct device *dev = acpi_desc->dev; 39 + int found_match = 0; 40 + 41 + mutex_lock(&acpi_desc->init_mutex); 42 + list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { 43 + struct acpi_nfit_system_address *spa = nfit_spa->spa; 44 + 45 + if (nfit_spa_type(spa) == NFIT_SPA_PM) 46 + continue; 47 + /* find the spa that covers the mce addr */ 48 + if (spa->address > mce->addr) 49 + continue; 50 + if ((spa->address + spa->length - 1) < mce->addr) 51 + continue; 52 + found_match = 1; 53 + dev_dbg(dev, "%s: addr in SPA %d (0x%llx, 0x%llx)\n", 54 + __func__, spa->range_index, spa->address, 55 + spa->length); 56 + /* 57 + * We can break at the first match because we're going 58 + * to rescan all the SPA ranges. There shouldn't be any 59 + * aliasing anyway. 60 + */ 61 + break; 62 + } 63 + mutex_unlock(&acpi_desc->init_mutex); 64 + 65 + /* 66 + * We can ignore an -EBUSY here because if an ARS is already 67 + * in progress, just let that be the last authoritative one 68 + */ 69 + if (found_match) 70 + acpi_nfit_ars_rescan(acpi_desc); 71 + } 72 + 73 + mutex_unlock(&acpi_desc_lock); 74 + return NOTIFY_DONE; 75 + } 76 + 77 + static struct notifier_block nfit_mce_dec = { 78 + .notifier_call = nfit_handle_mce, 79 + }; 80 + 81 + void nfit_mce_register(void) 82 + { 83 + mce_register_decode_chain(&nfit_mce_dec); 84 + } 85 + 86 + void nfit_mce_unregister(void) 87 + { 88 + mce_unregister_decode_chain(&nfit_mce_dec); 89 + }
+20
drivers/acpi/nfit/nfit.h
··· 16 16 #define __NFIT_H__ 17 17 #include <linux/workqueue.h> 18 18 #include <linux/libnvdimm.h> 19 + #include <linux/ndctl.h> 19 20 #include <linux/types.h> 20 21 #include <linux/uuid.h> 21 22 #include <linux/acpi.h> ··· 149 148 struct nd_cmd_ars_status *ars_status; 150 149 size_t ars_status_size; 151 150 struct work_struct work; 151 + struct list_head list; 152 152 struct kernfs_node *scrub_count_state; 153 153 unsigned int scrub_count; 154 154 unsigned int cancel:1; ··· 188 186 u64 cmd_offset; 189 187 u32 dimm_flags; 190 188 }; 189 + 190 + extern struct list_head acpi_descs; 191 + extern struct mutex acpi_desc_lock; 192 + int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc); 193 + 194 + #ifdef CONFIG_X86_MCE 195 + void nfit_mce_register(void); 196 + void nfit_mce_unregister(void); 197 + #else 198 + static inline void nfit_mce_register(void) 199 + { 200 + } 201 + static inline void nfit_mce_unregister(void) 202 + { 203 + } 204 + #endif 205 + 206 + int nfit_spa_type(struct acpi_nfit_system_address *spa); 191 207 192 208 static inline struct acpi_nfit_memory_map *__to_nfit_memdev( 193 209 struct nfit_mem *nfit_mem)
+1
tools/testing/nvdimm/Kbuild
··· 30 30 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o 31 31 32 32 nfit-y := $(ACPI_SRC)/core.o 33 + nfit-$(CONFIG_X86_MCE) += $(ACPI_SRC)/mce.o 33 34 nfit-y += config_check.o 34 35 35 36 nd_pmem-y := $(NVDIMM_SRC)/pmem.o