From 6109f24f87d75122cf6de50901115cbee4285ce2 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 17 May 2024 13:43:07 -0700 Subject: drm/xe: Add helper to accumulate exec queue runtime Add a helper to accumulate per-client runtime of all its exec queues. This is called every time a sched job is finished. v2: - Use guc_exec_queue_free_job() and execlist_job_free() to accumulate runtime when job is finished since xe_sched_job_completed() is not a notification that job finished. - Stop trying to update runtime from xe_exec_queue_fini() - that is redundant and may happen after xef is closed, leading to a use-after-free - Do not special case the first timestamp read: the default LRC sets CTX_TIMESTAMP to zero, so even the first sample should be a valid one. - Handle the parallel submission case by multiplying the runtime by width. v3: Update comments Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240517204310.88854-6-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_exec_queue.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'drivers/gpu/drm/xe/xe_exec_queue.c') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 395de93579fa..fa6dc996eca8 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -769,6 +769,43 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q) q->lrc[0].fence_ctx.next_seqno - 1; } +/** + * xe_exec_queue_update_runtime() - Update runtime for this exec queue from hw + * @q: The exec queue + * + * Update the timestamp saved by HW for this exec queue and save runtime + * calculated by using the delta from last update. On multi-lrc case, only the + * first is considered. + */ +void xe_exec_queue_update_runtime(struct xe_exec_queue *q) +{ + struct xe_file *xef; + struct xe_lrc *lrc; + u32 old_ts, new_ts; + + /* + * Jobs that are run during driver load may use an exec_queue, but are + * not associated with a user xe file, so avoid accumulating busyness + * for kernel specific work. + */ + if (!q->vm || !q->vm->xef) + return; + + xef = q->vm->xef; + + /* + * Only sample the first LRC. For parallel submission, all of them are + * scheduled together and we compensate that below by multiplying by + * width - this may introduce errors if that premise is not true and + * they don't exit 100% aligned. On the other hand, looping through + * the LRCs and reading them in different time could also introduce + * errors. + */ + lrc = &q->lrc[0]; + new_ts = xe_lrc_update_timestamp(lrc, &old_ts); + xef->runtime[q->class] += (new_ts - old_ts) * q->width; +} + void xe_exec_queue_kill(struct xe_exec_queue *q) { struct xe_exec_queue *eq = q, *next; -- cgit v1.2.3 From ad1e331fc451a2cffc72ae193b843682ce237e24 Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Wed, 22 May 2024 13:01:01 -0400 Subject: drm/xe: Relax runtime pm protection during execution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Limit the protection only during moments of actual job execution, and introduce protection for guc submit fini, which is currently unprotected due to the absence of exec_queue life protection. In the regular use case scenario, user space will create an exec queue, and keep it alive to reuse that until it is done with that kind of workload. For the regular desktop cases, it means that the exec_queue is alive even on idle scenarios where display goes off. This is unacceptable since this would entirely block runtime PM indefinitely, blocking deeper Package-C state. This would be a waste drainage of power. Cc: Matthew Brost Tested-by: Francois Dugast Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20240522170105.327472-3-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_exec_queue.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'drivers/gpu/drm/xe/xe_exec_queue.c') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index fa6dc996eca8..0fd61fb4d104 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -106,7 +106,6 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe, static int __xe_exec_queue_init(struct xe_exec_queue *q) { - struct xe_device *xe = gt_to_xe(q->gt); int i, err; for (i = 0; i < q->width; ++i) { @@ -119,17 +118,6 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q) if (err) goto err_lrc; - /* - * Normally the user vm holds an rpm ref to keep the device - * awake, and the context holds a ref for the vm, however for - * some engines we use the kernels migrate vm underneath which offers no - * such rpm ref, or we lack a vm. Make sure we keep a ref here, so we - * can perform GuC CT actions when needed. Caller is expected to have - * already grabbed the rpm ref outside any sensitive locks. - */ - if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && (q->flags & EXEC_QUEUE_FLAG_VM || !q->vm)) - xe_pm_runtime_get_noresume(xe); - return 0; err_lrc: @@ -216,8 +204,6 @@ void xe_exec_queue_fini(struct xe_exec_queue *q) for (i = 0; i < q->width; ++i) xe_lrc_finish(q->lrc + i); - if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && (q->flags & EXEC_QUEUE_FLAG_VM || !q->vm)) - xe_pm_runtime_put(gt_to_xe(q->gt)); __xe_exec_queue_free(q); } -- cgit v1.2.3 From 08f7200899ca72dec550af092ae424b7db099abd Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 27 May 2024 15:59:08 +0200 Subject: drm/xe: Decouple job seqno and lrc seqno MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tightly coupling these seqno presents problems if alternative fences for jobs are used. Decouple these for correctness. v2: - Slightly reword commit message (Thomas) - Make sure the lrc fence ops are used in comparison (Thomas) - Assume seqno is unsigned rather than signed in format string (Thomas) Cc: Thomas Hellström Signed-off-by: Matthew Brost Signed-off-by: Thomas Hellström Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240527135912.152156-2-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/xe/xe_exec_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/xe/xe_exec_queue.c') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 0fd61fb4d104..e8bf250f5b6a 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -98,7 +98,7 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe, if (xe_exec_queue_is_parallel(q)) { q->parallel.composite_fence_ctx = dma_fence_context_alloc(1); - q->parallel.composite_fence_seqno = XE_FENCE_INITIAL_SEQNO; + q->parallel.composite_fence_seqno = 0; } return q; -- cgit v1.2.3 From 0ac7a2c745e8a42803378b944fa0f4455b7240f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Mon, 27 May 2024 15:59:10 +0200 Subject: drm/xe: Don't initialize fences at xe_sched_job_create() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-allocate but don't initialize fences at xe_sched_job_create(), and initialize / arm them instead at xe_sched_job_arm(). This makes it possible to move xe_sched_job_create() with its memory allocation out of any lock that is required for fence initialization, and that may not allow memory allocation under it. Replaces the struct dma_fence_array for parallell jobs with a struct dma_fence_chain, since the former doesn't allow a split-up between allocation and initialization. v2: - Rebase. - Don't always use the first lrc when initializing parallel lrc fences. - Use dma_fence_chain_contained() to access the lrc fences. v4: - Add an assert that job->lrc_seqno == fence->seqno. (Matthew Brost) Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Reviewed-by: Rodrigo Vivi #v2 Link: https://patchwork.freedesktop.org/patch/msgid/20240527135912.152156-4-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/xe/xe_exec_queue.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'drivers/gpu/drm/xe/xe_exec_queue.c') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index e8bf250f5b6a..e3cebec3de24 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -96,11 +96,6 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe, } } - if (xe_exec_queue_is_parallel(q)) { - q->parallel.composite_fence_ctx = dma_fence_context_alloc(1); - q->parallel.composite_fence_seqno = 0; - } - return q; } -- cgit v1.2.3 From 45bb564de0a6f87e9f502ceb4ff4d9f936365c85 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 24 May 2024 16:47:43 -0700 Subject: drm/xe: Use run_ticks instead of runtime for client stats Note that runtime is also used in the pm context, so it is confusing to use the same name to denote run time of the drm client. Use a more appropriate name for the client utilization. While at it, drop the incorrect multi-lrc comment in the helper description v2: s/show_runtime/show_run_ticks/ (Rodrigo) Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240524234744.1352543-1-umesh.nerlige.ramappa@intel.com --- drivers/gpu/drm/xe/xe_exec_queue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/xe/xe_exec_queue.c') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index e3cebec3de24..f4359479318b 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -751,14 +751,14 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q) } /** - * xe_exec_queue_update_runtime() - Update runtime for this exec queue from hw + * xe_exec_queue_update_run_ticks() - Update run time in ticks for this exec queue + * from hw * @q: The exec queue * - * Update the timestamp saved by HW for this exec queue and save runtime - * calculated by using the delta from last update. On multi-lrc case, only the - * first is considered. + * Update the timestamp saved by HW for this exec queue and save run ticks + * calculated by using the delta from last update. */ -void xe_exec_queue_update_runtime(struct xe_exec_queue *q) +void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) { struct xe_file *xef; struct xe_lrc *lrc; @@ -784,7 +784,7 @@ void xe_exec_queue_update_runtime(struct xe_exec_queue *q) */ lrc = &q->lrc[0]; new_ts = xe_lrc_update_timestamp(lrc, &old_ts); - xef->runtime[q->class] += (new_ts - old_ts) * q->width; + xef->run_ticks[q->class] += (new_ts - old_ts) * q->width; } void xe_exec_queue_kill(struct xe_exec_queue *q) -- cgit v1.2.3 From ce62827bc294ba5f8b3909bfa5d7dbf9de8aab6b Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 24 May 2024 16:47:44 -0700 Subject: drm/xe: Do not access xe file when updating exec queue run_ticks The current code is running into a use after free case where xe file is closed before the exec queue run_ticks can be updated. This is occurring in the xe_file_close path. To fix that, do not access xe file when updating the exec queue run_ticks. Instead store the exec queue run_ticks locally in the exec queue object and accumulate it when the user dumps the drm client stats. We know that the xe file is valid when user is dumping the run_ticks for the drm client, so this effectively removes the dependency on xe file object in xe_exec_queue_update_run_ticks(). v2: - Fix the accumulation of q->run_ticks delta into xe file run_ticks - s/runtime/run_ticks/ (Rodrigo) Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1908 Fixes: 6109f24f87d7 ("drm/xe: Add helper to accumulate exec queue runtime") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240524234744.1352543-2-umesh.nerlige.ramappa@intel.com --- drivers/gpu/drm/xe/xe_exec_queue.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers/gpu/drm/xe/xe_exec_queue.c') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index f4359479318b..a2daae10ccc6 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -760,7 +760,6 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q) */ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) { - struct xe_file *xef; struct xe_lrc *lrc; u32 old_ts, new_ts; @@ -772,8 +771,6 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) if (!q->vm || !q->vm->xef) return; - xef = q->vm->xef; - /* * Only sample the first LRC. For parallel submission, all of them are * scheduled together and we compensate that below by multiplying by @@ -784,7 +781,7 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) */ lrc = &q->lrc[0]; new_ts = xe_lrc_update_timestamp(lrc, &old_ts); - xef->run_ticks[q->class] += (new_ts - old_ts) * q->width; + q->run_ticks += (new_ts - old_ts) * q->width; } void xe_exec_queue_kill(struct xe_exec_queue *q) -- cgit v1.2.3 From 264eecdba211bbeb8c0ed313ffe03e9dd1e20262 Mon Sep 17 00:00:00 2001 From: Niranjana Vishwanathapura Date: Wed, 29 May 2024 20:22:10 -0700 Subject: drm/xe: Decouple xe_exec_queue and xe_lrc Decouple xe_lrc from xe_exec_queue and reference count xe_lrc. Removing hard coupling between xe_exec_queue and xe_lrc allows flexible design where the user interface xe_exec_queue can be destroyed independent of the hardware/firmware interface xe_lrc. v2: Fix lrc indexing in wq_item_append() Signed-off-by: Niranjana Vishwanathapura Reviewed-by: Matthew Brost Signed-off-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240530032211.29299-1-niranjana.vishwanathapura@intel.com --- drivers/gpu/drm/xe/xe_exec_queue.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/xe/xe_exec_queue.c') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index a2daae10ccc6..27215075c799 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -86,7 +86,7 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe, if (extensions) { /* - * may set q->usm, must come before xe_lrc_init(), + * may set q->usm, must come before xe_lrc_create(), * may overwrite q->sched_props, must come before q->ops->init() */ err = exec_queue_user_extensions(xe, q, extensions, 0); @@ -104,9 +104,11 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q) int i, err; for (i = 0; i < q->width; ++i) { - err = xe_lrc_init(q->lrc + i, q->hwe, q, q->vm, SZ_16K); - if (err) + q->lrc[i] = xe_lrc_create(q->hwe, q->vm, SZ_16K); + if (IS_ERR(q->lrc[i])) { + err = PTR_ERR(q->lrc[i]); goto err_lrc; + } } err = q->ops->init(q); @@ -117,7 +119,7 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q) err_lrc: for (i = i - 1; i >= 0; --i) - xe_lrc_finish(q->lrc + i); + xe_lrc_put(q->lrc[i]); return err; } @@ -198,7 +200,7 @@ void xe_exec_queue_fini(struct xe_exec_queue *q) int i; for (i = 0; i < q->width; ++i) - xe_lrc_finish(q->lrc + i); + xe_lrc_put(q->lrc[i]); __xe_exec_queue_free(q); } @@ -701,7 +703,7 @@ bool xe_exec_queue_is_lr(struct xe_exec_queue *q) static s32 xe_exec_queue_num_job_inflight(struct xe_exec_queue *q) { - return q->lrc->fence_ctx.next_seqno - xe_lrc_seqno(q->lrc) - 1; + return q->lrc[0]->fence_ctx.next_seqno - xe_lrc_seqno(q->lrc[0]) - 1; } /** @@ -712,7 +714,7 @@ static s32 xe_exec_queue_num_job_inflight(struct xe_exec_queue *q) */ bool xe_exec_queue_ring_full(struct xe_exec_queue *q) { - struct xe_lrc *lrc = q->lrc; + struct xe_lrc *lrc = q->lrc[0]; s32 max_job = lrc->ring.size / MAX_JOB_SIZE_BYTES; return xe_exec_queue_num_job_inflight(q) >= max_job; @@ -738,16 +740,16 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q) int i; for (i = 0; i < q->width; ++i) { - if (xe_lrc_seqno(&q->lrc[i]) != - q->lrc[i].fence_ctx.next_seqno - 1) + if (xe_lrc_seqno(q->lrc[i]) != + q->lrc[i]->fence_ctx.next_seqno - 1) return false; } return true; } - return xe_lrc_seqno(&q->lrc[0]) == - q->lrc[0].fence_ctx.next_seqno - 1; + return xe_lrc_seqno(q->lrc[0]) == + q->lrc[0]->fence_ctx.next_seqno - 1; } /** @@ -779,7 +781,7 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) * the LRCs and reading them in different time could also introduce * errors. */ - lrc = &q->lrc[0]; + lrc = q->lrc[0]; new_ts = xe_lrc_update_timestamp(lrc, &old_ts); q->run_ticks += (new_ts - old_ts) * q->width; } -- cgit v1.2.3