diff options
| author | Shiju Jose <shiju.jose@huawei.com> | 2025-02-12 14:36:42 +0000 |
|---|---|---|
| committer | Borislav Petkov (AMD) <bp@alien8.de> | 2025-02-26 11:13:23 +0100 |
| commit | 699ea5219c4b1d9d8819eb2d99e51a3fdb7b1d7b (patch) | |
| tree | b77074ec0ffac02fb59c799a179a10983fa192b5 /drivers/edac/edac_device.c | |
| parent | bcbd069b11b024994e30c7c2f3d716a4141fdab1 (diff) | |
EDAC: Add a memory repair control feature
Add a generic EDAC memory repair control driver to manage memory repairs in
the system, such as CXL Post Package Repair (PPR) and other soft and hard PPR
features.
For example, a CXL device with DRAM components that support PPR features may
implement PPR maintenance operations. DRAM components may support two types of
PPR:
- hard PPR, for a permanent row repair, and
- soft PPR, for a temporary row repair.
Soft PPR is much faster than hard PPR, but the repair is lost with a power
cycle.
When a CXL device detects an error in a memory, it may report the need for
a repair maintenance operation by using an event record where the "maintenance
needed" flag is set. The event records contain the device physical
address (DPA) and other optional attributes of the memory to repair.
The kernel will report the corresponding CXL general media or DRAM trace event
to userspace, and userspace tools (e.g. rasdaemon) will initiate a repair
operation in response to the device request via the sysfs repair control.
Device with memory repair features registers with EDAC device driver, which
retrieves a memory repair descriptor from EDAC memory repair driver and exposes
the sysfs repair control attributes to userspace in
/sys/bus/edac/devices/<dev-name>/mem_repairX/.
The common memory repair control interface abstracts the control of arbitrary
memory repair functionality into a standardized set of functions. The sysfs
memory repair attribute nodes are only available if the client driver has
implemented the corresponding attribute callback function and provided
operations to the EDAC device driver during registration.
[ bp: Massage, fixup edac_dev_register() retvals, merge
write_overflow fix to mem_repair_create_desc() ]
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250212143654.1893-5-shiju.jose@huawei.com
Diffstat (limited to 'drivers/edac/edac_device.c')
| -rw-r--r-- | drivers/edac/edac_device.c | 33 |
1 files changed, 33 insertions, 0 deletions
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index b914ca3fc5e5..16611515ab34 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -575,6 +575,7 @@ static void edac_dev_release(struct device *dev) { struct edac_dev_feat_ctx *ctx = container_of(dev, struct edac_dev_feat_ctx, dev); + kfree(ctx->mem_repair); kfree(ctx->scrub); kfree(ctx->dev.groups); kfree(ctx); @@ -613,6 +614,7 @@ int edac_dev_register(struct device *parent, char *name, const struct attribute_group **ras_attr_groups; struct edac_dev_data *dev_data; struct edac_dev_feat_ctx *ctx; + int mem_repair_cnt = 0; int attr_gcnt = 0; int ret = -ENOMEM; int scrub_cnt = 0; @@ -631,6 +633,10 @@ int edac_dev_register(struct device *parent, char *name, case RAS_FEAT_ECS: attr_gcnt += ras_features[feat].ecs_info.num_media_frus; break; + case RAS_FEAT_MEM_REPAIR: + attr_gcnt++; + mem_repair_cnt++; + break; default: return -EINVAL; } @@ -650,8 +656,15 @@ int edac_dev_register(struct device *parent, char *name, goto groups_free; } + if (mem_repair_cnt) { + ctx->mem_repair = kcalloc(mem_repair_cnt, sizeof(*ctx->mem_repair), GFP_KERNEL); + if (!ctx->mem_repair) + goto data_mem_free; + } + attr_gcnt = 0; scrub_cnt = 0; + mem_repair_cnt = 0; for (feat = 0; feat < num_features; feat++, ras_features++) { switch (ras_features->ft_type) { case RAS_FEAT_SCRUB: @@ -688,6 +701,25 @@ int edac_dev_register(struct device *parent, char *name, attr_gcnt += ras_features->ecs_info.num_media_frus; break; + case RAS_FEAT_MEM_REPAIR: + if (!ras_features->mem_repair_ops || + mem_repair_cnt != ras_features->instance) { + ret = -EINVAL; + goto data_mem_free; + } + + dev_data = &ctx->mem_repair[mem_repair_cnt]; + dev_data->instance = mem_repair_cnt; + dev_data->mem_repair_ops = ras_features->mem_repair_ops; + dev_data->private = ras_features->ctx; + ret = edac_mem_repair_get_desc(parent, &ras_attr_groups[attr_gcnt], + ras_features->instance); + if (ret) + goto data_mem_free; + + mem_repair_cnt++; + attr_gcnt++; + break; default: ret = -EINVAL; goto data_mem_free; @@ -714,6 +746,7 @@ int edac_dev_register(struct device *parent, char *name, return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev); data_mem_free: + kfree(ctx->mem_repair); kfree(ctx->scrub); groups_free: kfree(ras_attr_groups); |
