| author | Matthew Brost <matthew.brost@intel.com> | 2025-10-08 14:45:30 -0700 |
|---|---|---|
| committer | Matthew Brost <matthew.brost@intel.com> | 2025-10-09 03:24:28 -0700 |
| commit | a093570ecd2239ea19db5a158ae616f3dcbec048 | |
| tree | e26fdd876c34c1f88c60b56e714883219b0aef93 | drivers/gpu/drm/xe |
| parent | efd38d619a03b2aadf124a4ddeb4054a447e2c8c | |
drm/xe/vf: Ensure media GT VF recovery runs after primary GT on PTL
It is possible for the media GT's VF post-migration recovery work item
to be scheduled before the primary GT's work item. Since the media GT
depends on the primary GT's work item completing CCS restore, detect
this condition and re-queue the media GT's work item to run later.
v5:
- Adjust debug message (Tomasz)
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Tomasz Lis <tomasz.lis@intel.com>
Link: https://lore.kernel.org/r/20251008214532.3442967-33-matthew.brost@intel.com
Diffstat (limited to 'drivers/gpu/drm/xe')
| -rw-r--r-- | drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 29 |
1 file changed, 27 insertions(+), 2 deletions(-)
```diff
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
index edd98a4da306..b527f78da295 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
@@ -1107,8 +1107,22 @@ void xe_gt_sriov_vf_print_version(struct xe_gt *gt, struct drm_printer *p)
 		   pf_version->major, pf_version->minor);
 }
 
-static void vf_post_migration_shutdown(struct xe_gt *gt)
+static bool vf_post_migration_shutdown(struct xe_gt *gt)
 {
+	struct xe_device *xe = gt_to_xe(gt);
+
+	/*
+	 * On platforms where CCS must be restored by the primary GT, the media
+	 * GT's VF post-migration recovery must run afterward. Detect this case
+	 * and re-queue the media GT's restore work item if necessary.
+	 */
+	if (xe->info.needs_shared_vf_gt_wq && xe_gt_is_media_type(gt)) {
+		struct xe_gt *primary_gt = gt_to_tile(gt)->primary_gt;
+
+		if (xe_gt_sriov_vf_recovery_pending(primary_gt))
+			return true;
+	}
+
 	spin_lock_irq(&gt->sriov.vf.migration.lock);
 	gt->sriov.vf.migration.recovery_queued = false;
 	spin_unlock_irq(&gt->sriov.vf.migration.lock);
@@ -1116,6 +1130,8 @@ static void vf_post_migration_shutdown(struct xe_gt *gt)
 	xe_guc_ct_flush_and_stop(&gt->uc.guc.ct);
 	xe_guc_submit_pause(&gt->uc.guc);
 	xe_tlb_inval_reset(&gt->tlb_inval);
+
+	return false;
 }
 
 static size_t post_migration_scratch_size(struct xe_device *xe)
@@ -1194,11 +1210,14 @@ static void vf_post_migration_recovery(struct xe_gt *gt)
 {
 	struct xe_device *xe = gt_to_xe(gt);
 	int err;
+	bool retry;
 
 	xe_gt_sriov_dbg(gt, "migration recovery in progress\n");
 
 	xe_pm_runtime_get(xe);
-	vf_post_migration_shutdown(gt);
+	retry = vf_post_migration_shutdown(gt);
+	if (retry)
+		goto queue;
 
 	if (!xe_sriov_vf_migration_supported(xe)) {
 		xe_gt_sriov_err(gt, "migration is not supported\n");
@@ -1226,6 +1245,12 @@ fail:
 	xe_pm_runtime_put(xe);
 	xe_gt_sriov_err(gt, "migration recovery failed (%pe)\n", ERR_PTR(err));
 	xe_device_declare_wedged(xe);
+	return;
+
+queue:
+	xe_gt_sriov_info(gt, "Re-queuing migration recovery\n");
+	queue_work(gt->ordered_wq, &gt->sriov.vf.migration.worker);
+	xe_pm_runtime_put(xe);
 }
 
 static void migration_worker_func(struct work_struct *w)
```
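For readers less familiar with the idiom, the fix above boils down to a small pattern: a work item checks whether the work it depends on is still pending and, if so, puts itself back on the queue instead of running out of order. Below is a minimal sketch of that pattern using the plain Linux workqueue API; the names (`dependency_pending`, `recovery_work`, `recovery_work_func`) are hypothetical stand-ins for illustration, not the Xe driver's actual code.

```c
#include <linux/workqueue.h>
#include <linux/atomic.h>

/*
 * Hypothetical stand-ins: 'dependency_pending' plays the role of
 * xe_gt_sriov_vf_recovery_pending() on the primary GT, and
 * 'recovery_work' the media GT's recovery work item.
 */
static atomic_t dependency_pending = ATOMIC_INIT(1);
static struct work_struct recovery_work;

static void recovery_work_func(struct work_struct *w)
{
	/*
	 * The work we depend on has not finished yet: re-queue
	 * ourselves rather than running out of order.
	 */
	if (atomic_read(&dependency_pending)) {
		queue_work(system_wq, w);
		return;
	}

	/* ... the actual recovery steps would run here ... */
}

/* During init: INIT_WORK(&recovery_work, recovery_work_func); */
```

Note that an unconditional immediate re-queue can spin if nothing orders the two items. In the patch itself the re-queue targets `gt->ordered_wq`, and the `needs_shared_vf_gt_wq` flag suggests that on affected platforms both GTs' recovery items share that ordered workqueue, so the re-queued media item naturally runs after the primary GT's pending recovery.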
