summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMario Limonciello (AMD) <superm1@kernel.org>2026-01-07 15:37:28 -0600
committerAlex Deucher <alexander.deucher@amd.com>2026-01-14 14:51:36 -0500
commit28695ca09d326461f8078332aa01db516983e8a2 (patch)
tree12a56f7a9e76d504a6aa57e84e8c434ba4fd47f2
parent9cb6278b44c38899961b36d303d7b18b38be2a6e (diff)
drm/amd: Clean up kfd node on surprise disconnect
When an eGPU is unplugged the KFD topology should also be destroyed for that GPU. This never happens because the fini_sw callbacks never get to run. Run them manually before calling amdgpu_device_ip_fini_early() when a device has already been disconnected. This location is intentionally chosen to make sure that the kfd locking refcount doesn't get incremented unintentionally. Cc: kent.russell@amd.com Closes: https://community.frame.work/t/amd-egpu-on-linux/8691/33 Signed-off-by: Mario Limonciello (AMD) <superm1@kernel.org> Reviewed-by: Kent Russell <kent.russell@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> (cherry picked from commit 6a23e7b4332c10f8b56c33a9c5431b52ecff9aab) Cc: stable@vger.kernel.org
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c8
1 files changed, 8 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d5c44bd34d45..d2c3885de711 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5063,6 +5063,14 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_ttm_set_buffer_funcs_status(adev, false);
+ /*
+ * device went through surprise hotplug; we need to destroy topology
+ * before ip_fini_early to prevent kfd locking refcount issues by calling
+ * amdgpu_amdkfd_suspend()
+ */
+ if (drm_dev_is_unplugged(adev_to_drm(adev)))
+ amdgpu_amdkfd_device_fini_sw(adev);
+
amdgpu_device_ip_fini_early(adev);
amdgpu_irq_fini_hw(adev);