summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/ras
diff options
context:
space:
mode:
authorGangliang Xie <ganglxie@amd.com>2025-12-15 15:18:35 +0800
committerAlex Deucher <alexander.deucher@amd.com>2026-03-04 11:42:20 -0500
commit72289903a231e421b8c0f50d8213ac2ad28ab8a4 (patch)
tree1fd5f0ee4ead391d9c0587c911bc9d6562963673 /drivers/gpu/drm/amd/ras
parent42c46be2ec30df732cea4d4682e8f70795f21cee (diff)
drm/amd/ras: adapt page retirement process for pmfw eeprom
Read bad page data from the PMFW EEPROM when page retirement is triggered, and use the timestamp read from the EEPROM. Signed-off-by: Gangliang Xie <ganglxie@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/ras')
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_aca.c31
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c40
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h2
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c3
4 files changed, 66 insertions, 10 deletions
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca.c b/drivers/gpu/drm/amd/ras/rascore/ras_aca.c
index e433c70d2989..67a35409ff0e 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_aca.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca.c
@@ -234,16 +234,27 @@ static int aca_log_bad_bank(struct ras_core_context *ras_core,
bank_ecc->de_count) {
struct ras_bank_ecc ras_ecc = {0};
- ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core);
- ras_ecc.addr = bank_ecc->bank_info.addr;
- ras_ecc.ipid = bank_ecc->bank_info.ipid;
- ras_ecc.status = bank_ecc->bank_info.status;
- ras_ecc.seq_no = bank->seq_no;
-
- if (ras_core_gpu_in_reset(ras_core))
- ras_umc_log_bad_bank_pending(ras_core, &ras_ecc);
- else
- ras_umc_log_bad_bank(ras_core, &ras_ecc);
+ if (ras_fw_eeprom_supported(ras_core)) {
+ ret = ras_fw_eeprom_update_record(ras_core, &ras_ecc);
+ if (!ret) {
+ ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core);
+ ras_ecc.status = bank_ecc->bank_info.status;
+ ras_ecc.seq_no = bank->seq_no;
+ }
+ } else {
+ ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core);
+ ras_ecc.addr = bank_ecc->bank_info.addr;
+ ras_ecc.ipid = bank_ecc->bank_info.ipid;
+ ras_ecc.status = bank_ecc->bank_info.status;
+ ras_ecc.seq_no = bank->seq_no;
+ }
+
+ if (!ret) {
+ if (ras_core_gpu_in_reset(ras_core))
+ ras_umc_log_bad_bank_pending(ras_core, &ras_ecc);
+ else
+ ras_umc_log_bad_bank(ras_core, &ras_ecc);
+ }
}
aca_report_ecc_info(ras_core,
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
index 79494ad16ee5..4a1b966d22fa 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
@@ -24,6 +24,8 @@
#include "ras.h"
+#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */
+
void ras_fw_init_feature_flags(struct ras_core_context *ras_core)
{
struct ras_mp1 *mp1 = &ras_core->ras_mp1;
@@ -329,3 +331,41 @@ uint32_t ras_fw_eeprom_get_record_count(struct ras_core_context *ras_core)
return ras_core->ras_fw_eeprom.ras_num_recs;
}
+
+int ras_fw_eeprom_update_record(struct ras_core_context *ras_core,
+ struct ras_bank_ecc *ras_ecc)
+{
+ struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
+ int ret, retry = 20; /* retry: at most 20 polls of the bad page count */
+ u32 recs_num_new = control->ras_num_recs; /* starts at the cached count */
+
+ do {
+ /* A 1000ms timeout is long enough that smu_get_badpage_count
+ * won't return -EBUSY before the timeout expires.
+ */
+ ret = ras_fw_get_badpage_count(ras_core,
+ &recs_num_new, RAS_SMU_MESSAGE_TIMEOUT_MS);
+ if (!ret &&
+ (recs_num_new == control->ras_num_recs)) {
+ /* Updating the record count in PMFW takes some time, so
+ * smu_get_badpage_count may return immediately with a stale
+ * count; sleep for a while and then retry.
+ */
+ msleep(50); /* wait 50 ms before polling again */
+ retry--;
+ } else {
+ break; /* query failed, or the count differs from the cached one */
+ }
+ } while (retry);
+
+ if (ret)
+ return ret; /* propagate the count query error unchanged */
+
+ if (recs_num_new > control->ras_num_recs) /* count grew: fetch one record at the old count index */
+ ret = ras_fw_eeprom_read_idx(ras_core, 0,
+ ras_ecc, control->ras_num_recs, 1);
+ else
+ ret = -EINVAL; /* count did not grow: retries exhausted or count is lower */
+
+ return ret;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
index 353977a2371e..18d6548e2151 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
@@ -75,5 +75,7 @@ int ras_fw_eeprom_read_idx(struct ras_core_context *ras_core,
struct ras_bank_ecc *ras_ecc,
u32 rec_idx, const u32 num);
uint32_t ras_fw_eeprom_get_record_count(struct ras_core_context *ras_core);
+int ras_fw_eeprom_update_record(struct ras_core_context *ras_core,
+ struct ras_bank_ecc *ras_ecc);
#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
index 53dc59e4de0c..b809a2f21d73 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
@@ -373,6 +373,9 @@ static int umc_v12_0_bank_to_eeprom_record(struct ras_core_context *ras_core,
ACA_ADDR_2_ERR_ADDR(bank->addr), ACA_IPID_2_UMC_INST(bank->ipid),
&nps_addr, bank->nps, record);
+ if (ras_fw_eeprom_supported(ras_core) && bank->ts)
+ record->ts = bank->ts;
+
lookup_bad_pages_in_a_row(ras_core, record,
bank->nps, NULL, 0, bank->seq_no, true);