From 8ed9aaae39f39130b7a3eb2726be05d7f64b344c Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Tue, 23 Apr 2024 18:18:16 -0400 Subject: drm/xe: Force wedged state and block GT reset upon any GPU hang In many validation situations when debugging GPU Hangs, it is useful to preserve the GT situation from the moment that the timeout occurred. This patch introduces a module parameter that could be used on situations like this. If xe.wedged module parameter is set to 2, Xe will be declared wedged on every single execution timeout (a.k.a. GPU hang) right after devcoredump snapshot capture and without attempting any kind of GT reset and blocking entirely any kind of execution. v2: Really block gt_reset from guc side. (Lucas) s/wedged/busted (Lucas) v3: - s/busted/wedged - Really use global_flags (Dafna) - More robust timeout handling when wedging it. v4: A really robust clean exit done by Matt Brost. No more kernel warns on unbind. v5: Simplify error message (Lucas) Cc: Matthew Brost Cc: Dafna Hirschfeld Cc: Lucas De Marchi Cc: Alan Previn Cc: Himanshu Somaiya Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240423221817.1285081-3-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_exec_queue.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/gpu/drm/xe/xe_exec_queue.h') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h index 02ce8d204622..48f6da53a292 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.h +++ b/drivers/gpu/drm/xe/xe_exec_queue.h @@ -26,6 +26,15 @@ void xe_exec_queue_fini(struct xe_exec_queue *q); void xe_exec_queue_destroy(struct kref *ref); void xe_exec_queue_assign_name(struct xe_exec_queue *q, u32 instance); +static inline struct xe_exec_queue * +xe_exec_queue_get_unless_zero(struct xe_exec_queue *q) +{ + if (kref_get_unless_zero(&q->refcount)) + return q; + + return NULL; +} + struct xe_exec_queue *xe_exec_queue_lookup(struct xe_file *xef, u32 id); static inline struct xe_exec_queue *xe_exec_queue_get(struct xe_exec_queue *q) -- cgit v1.2.3 From 6109f24f87d75122cf6de50901115cbee4285ce2 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 17 May 2024 13:43:07 -0700 Subject: drm/xe: Add helper to accumulate exec queue runtime Add a helper to accumulate per-client runtime of all its exec queues. This is called every time a sched job is finished. v2: - Use guc_exec_queue_free_job() and execlist_job_free() to accumulate runtime when job is finished since xe_sched_job_completed() is not a notification that job finished. - Stop trying to update runtime from xe_exec_queue_fini() - that is redundant and may happen after xef is closed, leading to a use-after-free - Do not special case the first timestamp read: the default LRC sets CTX_TIMESTAMP to zero, so even the first sample should be a valid one. - Handle the parallel submission case by multiplying the runtime by width. v3: Update comments Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240517204310.88854-6-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_exec_queue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/xe/xe_exec_queue.h') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h index 48f6da53a292..e0f07d28ee1a 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.h +++ b/drivers/gpu/drm/xe/xe_exec_queue.h @@ -75,5 +75,6 @@ struct dma_fence *xe_exec_queue_last_fence_get(struct xe_exec_queue *e, struct xe_vm *vm); void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm, struct dma_fence *fence); +void xe_exec_queue_update_runtime(struct xe_exec_queue *q); #endif -- cgit v1.2.3 From 45bb564de0a6f87e9f502ceb4ff4d9f936365c85 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 24 May 2024 16:47:43 -0700 Subject: drm/xe: Use run_ticks instead of runtime for client stats Note that runtime is also used in the pm context, so it is confusing to use the same name to denote run time of the drm client. Use a more appropriate name for the client utilization. While at it, drop the incorrect multi-lrc comment in the helper description v2: s/show_runtime/show_run_ticks/ (Rodrigo) Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240524234744.1352543-1-umesh.nerlige.ramappa@intel.com --- drivers/gpu/drm/xe/xe_exec_queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/xe/xe_exec_queue.h') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h index e0f07d28ee1a..289a3a51d2a2 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.h +++ b/drivers/gpu/drm/xe/xe_exec_queue.h @@ -75,6 +75,6 @@ struct dma_fence *xe_exec_queue_last_fence_get(struct xe_exec_queue *e, struct xe_vm *vm); void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm, struct dma_fence *fence); -void xe_exec_queue_update_runtime(struct xe_exec_queue *q); +void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q); #endif -- cgit v1.2.3