summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
diff options
context:
space:
mode:
authorTao Zhou <tao.zhou1@amd.com>2025-11-06 16:26:56 +0800
committerAlex Deucher <alexander.deucher@amd.com>2025-11-11 21:54:14 -0500
commit7fb41ab3c94828ad48e1a6d2237e8a7e682c74b9 (patch)
tree6fbfe0f157e779fdb3b0097fce4171562a6cec24 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
parenteed30152746ec1d8b6e8ab31e349f1eb8d8bd666 (diff)
drm/amdgpu: optimize timeout implementation in ras_eeprom_update_record_num
The busy status returned by ras_eeprom_update_record_num may not be an error; increase the timeout to exclude false busy status. Also add more comments to make the code more readable. v2: define a macro for the timeout value. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c18
1 files changed, 13 insertions, 5 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 99aa1908833d..64dd7a81bff5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -124,6 +124,8 @@
RAS_TABLE_V2_1_INFO_SIZE) \
/ RAS_TABLE_RECORD_SIZE)
+#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */
+
/* Given a zero-based index of an EEPROM RAS record, yields the EEPROM
* offset off of RAS_TABLE_START. That is, this is something you can
* add to control->i2c_address, and then tell I2C layer to read
@@ -874,7 +876,7 @@ Out:
int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
- int ret, timeout = 1000;
+ int ret, retry = 20;
if (!amdgpu_ras_smu_eeprom_supported(adev))
return 0;
@@ -882,17 +884,23 @@ int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *contro
control->ras_num_recs_old = control->ras_num_recs;
do {
+ /* 1000ms timeout is long enough, smu_get_badpage_count won't
+ * return -EBUSY before timeout.
+ */
ret = amdgpu_ras_smu_get_badpage_count(adev,
- &(control->ras_num_recs), 12);
+ &(control->ras_num_recs), RAS_SMU_MESSAGE_TIMEOUT_MS);
if (!ret &&
(control->ras_num_recs_old == control->ras_num_recs)) {
- /* record number update in PMFW needs some time */
+ /* record number update in PMFW needs some time,
+ * smu_get_badpage_count may return immediately without
+ * count update, sleep for a while and retry again.
+ */
msleep(50);
- timeout -= 50;
+ retry--;
} else {
break;
}
- } while (timeout);
+ } while (retry);
/* no update of record number is not a real failure,
* don't print warning here