diff options
| author | Christian König <christian.koenig@amd.com> | 2026-04-21 12:39:54 +0200 |
|---|---|---|
| committer | Alex Deucher <alexander.deucher@amd.com> | 2026-05-19 12:07:42 -0400 |
| commit | c8ed2de0f2ee842d108ef96c125e931ea82b453c (patch) | |
| tree | fc4e44f7aa4df37120892ec400658a8851c983db /drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | |
| parent | be045c5c8305fca1214ebdd4b8433ac80635339b (diff) | |
drm/amdgpu: rework userq reset work handling
It is illegal to schedule reset work from another reset work!
Fix this by scheduling the userq reset work directly on the work queue
of the reset domain.
Not fully tested, I leave that to the IGT test cases.
Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Prike Liang <Prike.Liang@amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit fd9200ccefab94f27877d1943761d6b0ccbd89c8)
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 84 |
1 files changed, 44 insertions, 40 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 8841955927bb..aa6d4c71fba6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -82,19 +82,11 @@ static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev, return false; } -static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev) -{ - if (amdgpu_device_should_recover_gpu(adev)) { - amdgpu_reset_domain_schedule(adev->reset_domain, - &adev->userq_reset_work); - /* Wait for the reset job to complete */ - flush_work(&adev->userq_reset_work); - } -} - -static int -amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) +static void amdgpu_userq_mgr_reset_work(struct work_struct *work) { + struct amdgpu_userq_mgr *uq_mgr = + container_of(work, struct amdgpu_userq_mgr, + reset_work); struct amdgpu_device *adev = uq_mgr->adev; const int queue_types[] = { AMDGPU_RING_TYPE_COMPUTE, @@ -103,12 +95,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) }; const int num_queue_types = ARRAY_SIZE(queue_types); bool gpu_reset = false; - int r = 0; - int i; + int i, r; if (unlikely(adev->debug_disable_gpu_ring_reset)) { dev_err(adev->dev, "userq reset disabled by debug mask\n"); - return 0; + return; } /* @@ -116,7 +107,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) * skip all reset detection logic */ if (!amdgpu_gpu_recovery) - return 0; + return; /* * Iterate through all queue types to detect and reset problematic queues @@ -141,10 +132,19 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) } } - if (gpu_reset) - amdgpu_userq_gpu_reset(adev); + if (gpu_reset) { + struct amdgpu_reset_context reset_context; - return r; + memset(&reset_context, 0, sizeof(reset_context)); + + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + reset_context.src = AMDGPU_RESET_SRC_USERQ; + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/ + + amdgpu_device_gpu_recover(adev, NULL, &reset_context); + } } static void amdgpu_userq_hang_detect_work(struct work_struct *work) @@ -153,7 +153,11 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work) container_of(work, struct amdgpu_usermode_queue, hang_detect_work.work); - amdgpu_userq_detect_and_reset_queues(queue->userq_mgr); + /* + * Don't schedule the work here! Scheduling or queue work from one reset + * handler to another is illegal if you don't take extra precautions! + */ + amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work); } /* @@ -182,8 +186,8 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue) break; } - schedule_delayed_work(&queue->hang_detect_work, - msecs_to_jiffies(timeout_ms)); + queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work, + msecs_to_jiffies(timeout_ms)); } void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell) @@ -1259,28 +1263,13 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) if (ret) { drm_file_err(uq_mgr->file, "Couldn't unmap all the queues, eviction failed ret=%d\n", ret); - amdgpu_userq_detect_and_reset_queues(uq_mgr); + amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain, + &uq_mgr->reset_work); + flush_work(&uq_mgr->reset_work); } return ret; } -void amdgpu_userq_reset_work(struct work_struct *work) -{ - struct amdgpu_device *adev = container_of(work, struct amdgpu_device, - userq_reset_work); - struct amdgpu_reset_context reset_context; - - memset(&reset_context, 0, sizeof(reset_context)); - - reset_context.method = AMD_RESET_METHOD_NONE; - reset_context.reset_req_dev = adev; - reset_context.src = AMDGPU_RESET_SRC_USERQ; - set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/ - - amdgpu_device_gpu_recover(adev, NULL, &reset_context); -} - static void amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) { @@ -1314,9 +1303,24 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f userq_mgr->file = file_priv; INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker); + INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work); return 0; } +void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev) +{ + struct xarray *xa = &adev->userq_doorbell_xa; + struct amdgpu_usermode_queue *queue; + unsigned long flags, queue_id; + + xa_lock_irqsave(xa, flags); + xa_for_each(xa, queue_id, queue) { + cancel_delayed_work(&queue->hang_detect_work); + cancel_work(&queue->userq_mgr->reset_work); + } + xa_unlock_irqrestore(xa, flags); +} + void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr) { cancel_delayed_work_sync(&userq_mgr->resume_work); |
