Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/papr_scm: Implement initial support for injecting smart errors

Presently PAPR doesn't support injecting smart errors on an
NVDIMM. This makes testing the NVDIMM health reporting functionality
difficult, as simulating NVDIMM health related events needs a hacked-up
qemu version.

To solve this problem this patch proposes simulating a certain set of
NVDIMM health related events in papr_scm, specifically the 'fatal' health
state and the 'dirty' shutdown state. These errors can be injected via the
user-space 'ndctl-inject-smart(1)' command. With the proposed patch and the
corresponding ndctl patches, the following command flow is expected:

$ sudo ndctl list -DH -d nmem0
...
"health_state":"ok",
"shutdown_state":"clean",
...
# inject unsafe shutdown and fatal health error
$ sudo ndctl inject-smart nmem0 -Uf
...
"health_state":"fatal",
"shutdown_state":"dirty",
...
# uninject all errors
$ sudo ndctl inject-smart nmem0 -N
...
"health_state":"ok",
"shutdown_state":"clean",
...

The patch adds a new member 'health_bitmap_inject_mask' inside struct
papr_scm_priv which is then bitwise ORed into the health bitmap fetched from the
hypervisor. The value of 'health_bitmap_inject_mask' is accessible from sysfs
at nmemX/papr/health_bitmap_inject.

A new PDSM named 'SMART_INJECT' is proposed that accepts the newly
introduced 'struct nd_papr_pdsm_smart_inject' as a payload that is
exchanged between libndctl and papr_scm to indicate the requested
smart-error states.

When processing the PDSM 'SMART_INJECT', papr_pdsm_smart_inject()
constructs a pair of 'inject_mask' and 'clear_mask' bitmaps from the payload
and bit-blts them into the 'health_bitmap_inject_mask'. This ensures that, after
being fetched from the hypervisor, the health_bitmap reflects the requested
smart-error states.

Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220124202204.1488346-1-vaibhav@linux.ibm.com

authored by

Vaibhav Jain and committed by
Michael Ellerman
bbbca723 76b37281

+117 -3
+12
Documentation/ABI/testing/sysfs-bus-papr-pmem
··· 61 61 * "CchRHCnt" : Cache Read Hit Count 62 62 * "CchWHCnt" : Cache Write Hit Count 63 63 * "FastWCnt" : Fast Write Count 64 + 65 + What: /sys/bus/nd/devices/nmemX/papr/health_bitmap_inject 66 + Date: Jan, 2022 67 + KernelVersion: v5.17 68 + Contact: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>, nvdimm@lists.linux.dev, 69 + Description: 70 + (RO) Reports the health bitmap inject bitmap that is applied to 71 + bitmap received from PowerVM via the H_SCM_HEALTH. This is used 72 + to forcibly set specific bits returned from Hcall. These are then 73 + used to simulate various health or shutdown states for an nvdimm 74 + and are set by user-space tools like ndctl by issuing a PAPR DSM. 75 +
+18
arch/powerpc/include/uapi/asm/papr_pdsm.h
··· 116 116 }; 117 117 }; 118 118 119 + /* Flags for injecting specific smart errors */ 120 + #define PDSM_SMART_INJECT_HEALTH_FATAL (1 << 0) 121 + #define PDSM_SMART_INJECT_BAD_SHUTDOWN (1 << 1) 122 + 123 + struct nd_papr_pdsm_smart_inject { 124 + union { 125 + struct { 126 + /* One or more of PDSM_SMART_INJECT_ */ 127 + __u32 flags; 128 + __u8 fatal_enable; 129 + __u8 unsafe_shutdown_enable; 130 + }; 131 + __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; 132 + }; 133 + }; 134 + 119 135 /* 120 136 * Methods to be embedded in ND_CMD_CALL request. These are sent to the kernel 121 137 * via 'nd_cmd_pkg.nd_command' member of the ioctl struct ··· 139 123 enum papr_pdsm { 140 124 PAPR_PDSM_MIN = 0x0, 141 125 PAPR_PDSM_HEALTH, 126 + PAPR_PDSM_SMART_INJECT, 142 127 PAPR_PDSM_MAX, 143 128 }; 144 129 145 130 /* Maximal union that can hold all possible payload types */ 146 131 union nd_pdsm_payload { 147 132 struct nd_papr_pdsm_health health; 133 + struct nd_papr_pdsm_smart_inject smart_inject; 148 134 __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; 149 135 } __packed; 150 136
+87 -3
arch/powerpc/platforms/pseries/papr_scm.c
··· 120 120 121 121 /* length of the stat buffer as expected by phyp */ 122 122 size_t stat_buffer_len; 123 + 124 + /* The bits which needs to be overridden */ 125 + u64 health_bitmap_inject_mask; 126 + 123 127 }; 124 128 125 129 static int papr_scm_pmem_flush(struct nd_region *nd_region, ··· 351 347 static int __drc_pmem_query_health(struct papr_scm_priv *p) 352 348 { 353 349 unsigned long ret[PLPAR_HCALL_BUFSIZE]; 350 + u64 bitmap = 0; 354 351 long rc; 355 352 356 353 /* issue the hcall */ 357 354 rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index); 358 - if (rc != H_SUCCESS) { 355 + if (rc == H_SUCCESS) 356 + bitmap = ret[0] & ret[1]; 357 + else if (rc == H_FUNCTION) 358 + dev_info_once(&p->pdev->dev, 359 + "Hcall H_SCM_HEALTH not implemented, assuming empty health bitmap"); 360 + else { 361 + 359 362 dev_err(&p->pdev->dev, 360 363 "Failed to query health information, Err:%ld\n", rc); 361 364 return -ENXIO; 362 365 } 363 366 364 367 p->lasthealth_jiffies = jiffies; 365 - p->health_bitmap = ret[0] & ret[1]; 366 - 368 + /* Allow injecting specific health bits via inject mask. */ 369 + if (p->health_bitmap_inject_mask) 370 + bitmap = (bitmap & ~p->health_bitmap_inject_mask) | 371 + p->health_bitmap_inject_mask; 372 + WRITE_ONCE(p->health_bitmap, bitmap); 367 373 dev_dbg(&p->pdev->dev, 368 374 "Queried dimm health info. 
Bitmap:0x%016lx Mask:0x%016lx\n", 369 375 ret[0], ret[1]); ··· 683 669 return rc; 684 670 } 685 671 672 + /* Inject a smart error Add the dirty-shutdown-counter value to the pdsm */ 673 + static int papr_pdsm_smart_inject(struct papr_scm_priv *p, 674 + union nd_pdsm_payload *payload) 675 + { 676 + int rc; 677 + u32 supported_flags = 0; 678 + u64 inject_mask = 0, clear_mask = 0; 679 + u64 mask; 680 + 681 + /* Check for individual smart error flags and update inject/clear masks */ 682 + if (payload->smart_inject.flags & PDSM_SMART_INJECT_HEALTH_FATAL) { 683 + supported_flags |= PDSM_SMART_INJECT_HEALTH_FATAL; 684 + if (payload->smart_inject.fatal_enable) 685 + inject_mask |= PAPR_PMEM_HEALTH_FATAL; 686 + else 687 + clear_mask |= PAPR_PMEM_HEALTH_FATAL; 688 + } 689 + 690 + if (payload->smart_inject.flags & PDSM_SMART_INJECT_BAD_SHUTDOWN) { 691 + supported_flags |= PDSM_SMART_INJECT_BAD_SHUTDOWN; 692 + if (payload->smart_inject.unsafe_shutdown_enable) 693 + inject_mask |= PAPR_PMEM_SHUTDOWN_DIRTY; 694 + else 695 + clear_mask |= PAPR_PMEM_SHUTDOWN_DIRTY; 696 + } 697 + 698 + dev_dbg(&p->pdev->dev, "[Smart-inject] inject_mask=%#llx clear_mask=%#llx\n", 699 + inject_mask, clear_mask); 700 + 701 + /* Prevent concurrent access to dimm health bitmap related members */ 702 + rc = mutex_lock_interruptible(&p->health_mutex); 703 + if (rc) 704 + return rc; 705 + 706 + /* Use inject/clear masks to set health_bitmap_inject_mask */ 707 + mask = READ_ONCE(p->health_bitmap_inject_mask); 708 + mask = (mask & ~clear_mask) | inject_mask; 709 + WRITE_ONCE(p->health_bitmap_inject_mask, mask); 710 + 711 + /* Invalidate cached health bitmap */ 712 + p->lasthealth_jiffies = 0; 713 + 714 + mutex_unlock(&p->health_mutex); 715 + 716 + /* Return the supported flags back to userspace */ 717 + payload->smart_inject.flags = supported_flags; 718 + 719 + return sizeof(struct nd_papr_pdsm_health); 720 + } 721 + 686 722 /* 687 723 * 'struct pdsm_cmd_desc' 688 724 * Identifies supported PDSMs' expected 
length of in/out payloads ··· 765 701 .size_in = 0, 766 702 .size_out = sizeof(struct nd_papr_pdsm_health), 767 703 .service = papr_pdsm_health, 704 + }, 705 + 706 + [PAPR_PDSM_SMART_INJECT] = { 707 + .size_in = sizeof(struct nd_papr_pdsm_smart_inject), 708 + .size_out = sizeof(struct nd_papr_pdsm_smart_inject), 709 + .service = papr_pdsm_smart_inject, 768 710 }, 769 711 /* Empty */ 770 712 [PAPR_PDSM_MAX] = { ··· 908 838 return 0; 909 839 } 910 840 841 + static ssize_t health_bitmap_inject_show(struct device *dev, 842 + struct device_attribute *attr, 843 + char *buf) 844 + { 845 + struct nvdimm *dimm = to_nvdimm(dev); 846 + struct papr_scm_priv *p = nvdimm_provider_data(dimm); 847 + 848 + return sprintf(buf, "%#llx\n", 849 + READ_ONCE(p->health_bitmap_inject_mask)); 850 + } 851 + 852 + static DEVICE_ATTR_ADMIN_RO(health_bitmap_inject); 853 + 911 854 static ssize_t perf_stats_show(struct device *dev, 912 855 struct device_attribute *attr, char *buf) 913 856 { ··· 1035 952 &dev_attr_flags.attr, 1036 953 &dev_attr_perf_stats.attr, 1037 954 &dev_attr_dirty_shutdown.attr, 955 + &dev_attr_health_bitmap_inject.attr, 1038 956 NULL, 1039 957 }; 1040 958