summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLijo Lazar <lijo.lazar@amd.com>2025-06-28 10:50:25 +0530
committerAlex Deucher <alexander.deucher@amd.com>2025-08-11 11:12:53 -0400
commit1dd2fa0e00f17b909ff0fab92c1c48c2d7911c15 (patch)
tree61a12ab77d5b2077cd56846718df592e999ac32a
parent111821e4b5a3105c42c7c99f4abd4d8af9f64248 (diff)
drm/amdgpu: Save and restore switch state
During a DPC error kernel waits for the link to be active before notifying downstream devices. On certain platforms with Broadcom switch in synthetiic mode, switch responds with values even though the link is not fully ready. The config space restoration done by pcie port driver for SWUS/DS of dGPU is thus not effective as the switch is still doing internal enumeration. As a workaround, save state of SWUS/DS device in driver. Add additional check to see if link is active and restore the values during DPC error callbacks. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Yang Wang <kevinyang.wang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c85
2 files changed, 83 insertions, 5 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c2771e32d0bf..ddd472e56f69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -910,6 +910,9 @@ struct amdgpu_pcie_reset_ctx {
bool in_link_reset;
bool occurs_dpc;
bool audio_suspended;
+ struct pci_dev *swus;
+ struct pci_saved_state *swus_pcistate;
+ struct pci_saved_state *swds_pcistate;
};
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5eed5c500190..9c373bdadb39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -178,6 +178,8 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
BIT(AMD_IP_BLOCK_TYPE_PSP)
};
+static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);
+
static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
enum amd_ip_block_type block)
{
@@ -5013,7 +5015,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
adev->reset_domain = NULL;
kfree(adev->pci_state);
-
+ kfree(adev->pcie_reset_ctx.swds_pcistate);
+ kfree(adev->pcie_reset_ctx.swus_pcistate);
}
/**
@@ -6986,16 +6989,34 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
struct amdgpu_device *tmp_adev;
struct amdgpu_hive_info *hive;
struct list_head device_list;
- int r = 0, i;
+ struct pci_dev *link_dev;
+ int r = 0, i, timeout;
u32 memsize;
+ u16 status;
dev_info(adev->dev, "PCI error: slot reset callback!!\n");
memset(&reset_context, 0, sizeof(reset_context));
- /* wait for asic to come out of reset */
- msleep(700);
+ if (adev->pcie_reset_ctx.swus)
+ link_dev = adev->pcie_reset_ctx.swus;
+ else
+ link_dev = adev->pdev;
+ /* wait for asic to come out of reset, timeout = 10s */
+ timeout = 10000;
+ do {
+ usleep_range(10000, 10500);
+ r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status);
+ timeout -= 10;
+ } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) &&
+ (status != PCI_VENDOR_ID_AMD));
+ if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) {
+ r = -ETIME;
+ goto out;
+ }
+
+ amdgpu_device_load_switch_state(adev);
/* Restore PCI confspace */
amdgpu_device_load_pci_state(pdev);
@@ -7097,6 +7118,58 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
}
}
+static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev)
+{
+ struct pci_dev *parent = pci_upstream_bridge(adev->pdev);
+ int r;
+
+ if (parent->vendor != PCI_VENDOR_ID_ATI)
+ return;
+
+ /* If already saved, return */
+ if (adev->pcie_reset_ctx.swus)
+ return;
+ /* Upstream bridge is ATI, assume it's SWUS/DS architecture */
+ r = pci_save_state(parent);
+ if (r)
+ return;
+ adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(parent);
+
+ parent = pci_upstream_bridge(parent);
+ r = pci_save_state(parent);
+ if (r)
+ return;
+ adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(parent);
+
+ adev->pcie_reset_ctx.swus = parent;
+}
+
+static void amdgpu_device_load_switch_state(struct amdgpu_device *adev)
+{
+ struct pci_dev *pdev;
+ int r;
+
+ if (!adev->pcie_reset_ctx.swds_pcistate ||
+ !adev->pcie_reset_ctx.swus_pcistate)
+ return;
+
+ pdev = adev->pcie_reset_ctx.swus;
+ r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate);
+ if (!r) {
+ pci_restore_state(pdev);
+ } else {
+ dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r);
+ return;
+ }
+
+ pdev = pci_upstream_bridge(adev->pdev);
+ r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate);
+ if (!r)
+ pci_restore_state(pdev);
+ else
+ dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r);
+}
+
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
@@ -7121,6 +7194,8 @@ bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
return false;
}
+ amdgpu_device_cache_switch_state(adev);
+
return true;
}
@@ -7556,4 +7631,4 @@ u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
}
return uid_info->uid[type][inst];
-} \ No newline at end of file
+}