Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

EDAC: Add support for EDAC device features control

Add generic EDAC device feature controls supporting the registration of RAS
features available in the system. The driver exposes control attributes for
these features to userspace in

/sys/bus/edac/devices/<dev-name>/<ras-feature>

[ bp: Touch-up documentation, simplify, make edac_dev_type static,
fixup edac_dev_register() retvals. ]

Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Fan Ni <fan.ni@samsung.com>
Tested-by: Daniel Ferguson <danielf@os.amperecomputing.com>
Tested-by: Fan Ni <fan.ni@samsung.com>
Link: https://lore.kernel.org/r/20250212143654.1893-2-shiju.jose@huawei.com

authored by

Shiju Jose and committed by
Borislav Petkov (AMD)
db99ea5f 0ad2507d

+230
+93
Documentation/edac/features.rst
.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later

=================
EDAC/RAS features
=================

Copyright (c) 2024-2025 HiSilicon Limited.

:Author:   Shiju Jose <shiju.jose@huawei.com>
:License:  The GNU Free Documentation License, Version 1.2 without
           Invariant Sections, Front-Cover Texts nor Back-Cover Texts.
           (dual licensed under the GPL v2)

- Written for: 6.15

Introduction
------------

EDAC/RAS components plugging and high-level design:

1. Scrub control

2. Error Check Scrub (ECS) control

3. ACPI RAS2 features

4. Post Package Repair (PPR) control

5. Memory Sparing Repair control

High level design is illustrated in the following diagram::

        +-----------------------------------------------+
        |   Userspace - Rasdaemon                       |
        | +-------------+                               |
        | | RAS CXL mem |     +---------------+         |
        | |error handler|---->|               |         |
        | +-------------+     | RAS dynamic   |         |
        | +-------------+     | scrub, memory |         |
        | | RAS memory  |---->| repair control|         |
        | |error handler|     +----|----------+         |
        | +-------------+          |                    |
        +--------------------------|--------------------+
                                   |
                                   |
   +-------------------------------|------------------------------+
   |     Kernel EDAC extension for | controlling RAS Features     |
   |+------------------------------|----------------------------+ |
   || EDAC Core          Sysfs EDAC| Bus                        | |
   ||   +--------------------------|---------------------------+| |
   ||   |/sys/bus/edac/devices/<dev>/scrubX/ |   | EDAC device || |
   ||   |/sys/bus/edac/devices/<dev>/ecsX/   |<->| EDAC MC     || |
   ||   |/sys/bus/edac/devices/<dev>/repairX |   | EDAC sysfs  || |
   ||   +---------------------------|--------------------------+| |
   ||                           EDAC|Bus                        | |
   ||                               |                           | |
   ||   +----------+ Get feature    |      Get feature          | |
   ||   |          | desc +---------|------+ desc +----------+  | |
   ||   |EDAC scrub|<-----| EDAC device    |      |          |  | |
   ||   +----------+      | driver- RAS    |----->| EDAC mem |  | |
   ||   +----------+      | feature control|      | repair   |  | |
   ||   |          |<-----|                |      +----------+  | |
   ||   |EDAC ECS  |      +---------|------+                    | |
   ||   +----------+    Register RAS|features                   | |
   ||         ______________________|_____________              | |
   |+---------|---------------|------------------|--------------+ |
   |  +-------|----+  +-------|-------+     +----|----------+     |
   |  |            |  | CXL mem driver|     | Client driver |     |
   |  | ACPI RAS2  |  | scrub, ECS,   |     | memory repair |     |
   |  | driver     |  | sparing, PPR  |     | features      |     |
   |  +-----|------+  +-------|-------+     +------|--------+     |
   |        |                 |                    |              |
   +--------|-----------------|--------------------|--------------+
            |                 |                    |
   +--------|-----------------|--------------------|--------------+
   |    +---|-----------------|--------------------|-------+      |
   |    |                                                  |      |
   |    |            Platform HW and Firmware              |      |
   |    +--------------------------------------------------+      |
   +--------------------------------------------------------------+


1. EDAC Features components - Create feature-specific descriptors. For
   example: scrub, ECS, memory repair in the above diagram.

2. EDAC device driver for controlling RAS Features - Gets the feature's
   attribute descriptors from the EDAC RAS feature component, registers the
   device's RAS features with the EDAC bus and exposes the feature control
   attributes via sysfs. For example, /sys/bus/edac/devices/<dev-name>/<feature>X/

3. RAS dynamic feature controller - Userspace sample modules in rasdaemon for
   dynamic scrub/repair control to issue scrubbing/repair when an excessive
   number of corrected memory errors is reported in a short span of time.
+10
Documentation/edac/index.rst
.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later

==============
EDAC Subsystem
==============

.. toctree::
   :maxdepth: 1

   features
+101
drivers/edac/edac_device.c
··· 570 570 block ? block->name : "N/A", count, msg); 571 571 } 572 572 EXPORT_SYMBOL_GPL(edac_device_handle_ue_count); 573 + 574 + static void edac_dev_release(struct device *dev) 575 + { 576 + struct edac_dev_feat_ctx *ctx = container_of(dev, struct edac_dev_feat_ctx, dev); 577 + 578 + kfree(ctx->dev.groups); 579 + kfree(ctx); 580 + } 581 + 582 + static const struct device_type edac_dev_type = { 583 + .name = "edac_dev", 584 + .release = edac_dev_release, 585 + }; 586 + 587 + static void edac_dev_unreg(void *data) 588 + { 589 + device_unregister(data); 590 + } 591 + 592 + /** 593 + * edac_dev_register - register device for RAS features with EDAC 594 + * @parent: parent device. 595 + * @name: name for the folder in the /sys/bus/edac/devices/, 596 + * which is derived from the parent device. 597 + * For e.g. /sys/bus/edac/devices/cxl_mem0/ 598 + * @private: parent driver's data to store in the context if any. 599 + * @num_features: number of RAS features to register. 600 + * @ras_features: list of RAS features to register. 601 + * 602 + * Return: 603 + * * %0 - Success. 604 + * * %-EINVAL - Invalid parameters passed. 605 + * * %-ENOMEM - Dynamic memory allocation failed. 
606 + * 607 + */ 608 + int edac_dev_register(struct device *parent, char *name, 609 + void *private, int num_features, 610 + const struct edac_dev_feature *ras_features) 611 + { 612 + const struct attribute_group **ras_attr_groups; 613 + struct edac_dev_feat_ctx *ctx; 614 + int attr_gcnt = 0; 615 + int ret = -ENOMEM; 616 + int feat; 617 + 618 + if (!parent || !name || !num_features || !ras_features) 619 + return -EINVAL; 620 + 621 + /* Double parse to make space for attributes */ 622 + for (feat = 0; feat < num_features; feat++) { 623 + switch (ras_features[feat].ft_type) { 624 + /* Add feature specific code */ 625 + default: 626 + return -EINVAL; 627 + } 628 + } 629 + 630 + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 631 + if (!ctx) 632 + return -ENOMEM; 633 + 634 + ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL); 635 + if (!ras_attr_groups) 636 + goto ctx_free; 637 + 638 + attr_gcnt = 0; 639 + for (feat = 0; feat < num_features; feat++, ras_features++) { 640 + switch (ras_features->ft_type) { 641 + /* Add feature specific code */ 642 + default: 643 + ret = -EINVAL; 644 + goto groups_free; 645 + } 646 + } 647 + 648 + ctx->dev.parent = parent; 649 + ctx->dev.bus = edac_get_sysfs_subsys(); 650 + ctx->dev.type = &edac_dev_type; 651 + ctx->dev.groups = ras_attr_groups; 652 + ctx->private = private; 653 + dev_set_drvdata(&ctx->dev, ctx); 654 + 655 + ret = dev_set_name(&ctx->dev, name); 656 + if (ret) 657 + goto groups_free; 658 + 659 + ret = device_register(&ctx->dev); 660 + if (ret) { 661 + put_device(&ctx->dev); 662 + return ret; 663 + } 664 + 665 + return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev); 666 + 667 + groups_free: 668 + kfree(ras_attr_groups); 669 + ctx_free: 670 + kfree(ctx); 671 + return ret; 672 + } 673 + EXPORT_SYMBOL_GPL(edac_dev_register);
+26
include/linux/edac.h
··· 661 661 662 662 return mci->dimms[index]; 663 663 } 664 + 665 + /* RAS feature type */ 666 + enum edac_dev_feat { 667 + RAS_FEAT_MAX 668 + }; 669 + 670 + /* EDAC device feature information structure */ 671 + struct edac_dev_data { 672 + u8 instance; 673 + void *private; 674 + }; 675 + 676 + struct edac_dev_feat_ctx { 677 + struct device dev; 678 + void *private; 679 + }; 680 + 681 + struct edac_dev_feature { 682 + enum edac_dev_feat ft_type; 683 + u8 instance; 684 + void *ctx; 685 + }; 686 + 687 + int edac_dev_register(struct device *parent, char *dev_name, 688 + void *parent_pvt_data, int num_features, 689 + const struct edac_dev_feature *ras_features); 664 690 #endif /* _LINUX_EDAC_H_ */