summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2025-11-18 06:58:01 +1000
committerDave Airlie <airlied@redhat.com>2025-11-18 07:01:26 +1000
commitf3a1d69f9b388271986f4efe1fd775df15b443c1 (patch)
treeb82a68c63a88b24b30ede3b4263fb3d2f2b6344d /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parentfd1a11ea111b083aa3d19f36516ecda5efa2b69f (diff)
parentccd3b4c7c37fbbd3e5244d3c54ca24ae0a37810d (diff)
Merge tag 'amd-drm-next-6.19-2025-11-14' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.19-2025-11-14: amdgpu: - RAS updates - GC12 DCC P2P fix - Documentation fixes - Power limit code cleanup - Userq updates - VRR fix - SMART OLED support - DSC refactor for DCN 3.5 - Replay updates - DC clockgating updates - HDCP refactor - ISP fix - SMU 13.0.12 updates - JPEG 5.0.1 fix - VCE1 support - Enable DC by default on SI - Refactor CIK and SI enablement - Enable amdgpu by default for CI dGPUs - XGMI fixes - SR-IOV fixes - Memory allocation critical path fixes - Enable amdgpu by default on SI dGPUs amdkfd: - Relax checks on save area overallocations - Fix GPU mappings after prefetch radeon: - Refactor CIK and SI enablement Signed-off-by: Dave Airlie <airlied@redhat.com> From: Alex Deucher <alexander.deucher@amd.com> Link: https://patch.msgid.link/20251114192553.442621-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c23
1 files changed, 19 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 055a9bbabbdb..9e2e098af86c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3014,8 +3014,13 @@ static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev,
addr_in.ma.err_addr = bps->address;
addr_in.ma.socket_id = socket;
addr_in.ma.ch_inst = bps->mem_channel;
- /* tell RAS TA the node instance is not used */
- addr_in.ma.node_inst = TA_RAS_INV_NODE;
+ if (!amdgpu_ras_smu_eeprom_supported(adev)) {
+ /* tell RAS TA the node instance is not used */
+ addr_in.ma.node_inst = TA_RAS_INV_NODE;
+ } else {
+ addr_in.ma.umc_inst = bps->mcumc_id;
+ addr_in.ma.node_inst = bps->cu;
+ }
if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
@@ -3162,7 +3167,11 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
} else {
- save_nps = nps;
+ /* if pmfw manages eeprom, save_nps is not stored on eeprom,
+ * we should always convert mca address into physical address,
+ * make save_nps different from nps
+ */
+ save_nps = nps + 1;
}
if (save_nps == nps) {
@@ -3300,7 +3309,13 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
- unit_num = data->count / adev->umc.retire_unit - control->ras_num_recs;
+ if (amdgpu_ras_smu_eeprom_supported(adev))
+ unit_num = control->ras_num_recs -
+ control->ras_num_recs_old;
+ else
+ unit_num = data->count / adev->umc.retire_unit -
+ control->ras_num_recs;
+
save_count = con->bad_page_num - control->ras_num_bad_pages;
mutex_unlock(&con->recovery_lock);