| author | Philip Yang <Philip.Yang@amd.com> | 2024-11-26 11:33:15 -0500 |
|---|---|---|
| committer | Alex Deucher <alexander.deucher@amd.com> | 2024-12-18 12:39:07 -0500 |
| commit | 34db5a32617d102e8042151bb87590e43c97132e | |
| tree | c6e752e4208514bb16a697901bf6daf735d56afd /drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | |
| parent | 1b00143231d3e6f4b76f88f4edd6bb8a1332ef9b | |
drm/amdkfd: Queue interrupt work to different CPU
In CPX mode, each KFD node has an interrupt worker that processes the ih_fifo
and sends events to user space. Currently all interrupt workers of the same
adev are queued to the same CPU, so their execution is effectively serialized,
which causes KFD ih_fifo overflows when CPU usage is high.
Use a per-GPU unbound high-priority workqueue with the number of workers equal
to the number of partitions, and let queue_work select the next CPU round-robin
among the local CPUs of the same NUMA node.
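As a rough illustration outside the driver, here is a minimal sketch of the
before/after workqueue setup. `gpu_dev`, `ih_wq`, `num_nodes`, and
`gpu_ih_wq_init` are hypothetical stand-ins for the driver's own structures;
`alloc_workqueue()` and the WQ_HIGHPRI/WQ_UNBOUND flags are the real kernel API:

```c
#include <linux/workqueue.h>

/* Hypothetical container shared by all partitions (KFD nodes) of one GPU. */
struct gpu_dev {
	struct workqueue_struct *ih_wq;	/* one queue per GPU, not per node */
	unsigned int num_nodes;		/* partition count, e.g. 8 in CPX mode */
};

/*
 * Old scheme: every node allocated its own ordered queue with
 * alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); max_active = 1 on a bound
 * queue meant all workers of one adev serialized on a single CPU.
 *
 * New scheme: one unbound high-priority queue per GPU with max_active
 * equal to the partition count, so up to num_nodes workers can run
 * concurrently and the workqueue core spreads them over CPUs of the
 * local NUMA node.
 */
static int gpu_ih_wq_init(struct gpu_dev *dev)
{
	if (dev->ih_wq)	/* first partition to initialize allocates it */
		return 0;

	dev->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI | WQ_UNBOUND,
				     dev->num_nodes);
	return dev->ih_wq ? 0 : -ENOMEM;
}
```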
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 25 |
1 file changed, 9 insertions, 16 deletions
```diff
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
index 8e00800f3207..6beb786c582a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
@@ -62,11 +62,14 @@ int kfd_interrupt_init(struct kfd_node *node)
 		return r;
 	}
 
-	node->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1);
-	if (unlikely(!node->ih_wq)) {
-		kfifo_free(&node->ih_fifo);
-		dev_err(node->adev->dev, "Failed to allocate KFD IH workqueue\n");
-		return -ENOMEM;
+	if (!node->kfd->ih_wq) {
+		node->kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI | WQ_UNBOUND,
+						   node->kfd->num_nodes);
+		if (unlikely(!node->kfd->ih_wq)) {
+			kfifo_free(&node->ih_fifo);
+			dev_err(node->adev->dev, "Failed to allocate KFD IH workqueue\n");
+			return -ENOMEM;
+		}
 	}
 	spin_lock_init(&node->interrupt_lock);
 
@@ -96,16 +99,6 @@ void kfd_interrupt_exit(struct kfd_node *node)
 	spin_lock_irqsave(&node->interrupt_lock, flags);
 	node->interrupts_active = false;
 	spin_unlock_irqrestore(&node->interrupt_lock, flags);
-
-	/*
-	 * flush_work ensures that there are no outstanding
-	 * work-queue items that will access interrupt_ring. New work items
-	 * can't be created because we stopped interrupt handling above.
-	 */
-	flush_workqueue(node->ih_wq);
-
-	destroy_workqueue(node->ih_wq);
-
 	kfifo_free(&node->ih_fifo);
 }
 
@@ -155,7 +148,7 @@ static void interrupt_wq(struct work_struct *work)
 		/* If we spent more than a second processing signals,
 		 * reschedule the worker to avoid soft-lockup warnings
 		 */
-		queue_work(dev->ih_wq, &dev->interrupt_work);
+		queue_work(dev->kfd->ih_wq, &dev->interrupt_work);
 		break;
 	}
 }
```
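One consequence of moving `ih_wq` from `node` to the shared `node->kfd`:
`kfd_interrupt_exit()` no longer flushes or destroys the queue, so the shared
queue must be torn down exactly once per device after all partitions have
stopped interrupt handling, presumably in the device-level teardown path
outside this file's diff. A hedged sketch of what that teardown could look
like, reusing the hypothetical `gpu_dev` from above:

```c
/*
 * Hypothetical per-device teardown: called once, after every partition
 * has cleared interrupts_active so no new work can be queued.
 */
static void gpu_ih_wq_fini(struct gpu_dev *dev)
{
	if (!dev->ih_wq)
		return;
	/* destroy_workqueue() drains all pending work before freeing. */
	destroy_workqueue(dev->ih_wq);
	dev->ih_wq = NULL;
}
```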
