From a3ffaa5b397f4df9d6ac16b10583e9df8e6fa471 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 17 Mar 2026 16:34:41 -0400 Subject: drm/amdgpu/pm: drop SMU driver if version not matched messages It just leads to user confusion. Cc: Yang Wang Cc: Lijo Lazar Reviewed-by: Yang Wang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher (cherry picked from commit e471627d56272a791972f25e467348b611c31713) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 1 - drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c | 1 - drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c index 12b052d920f5..7ca8fdd23206 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c @@ -262,7 +262,6 @@ int smu_v11_0_check_fw_version(struct smu_context *smu) "smu fw program = %d, version = 0x%08x (%d.%d.%d)\n", smu->smc_driver_if_version, if_version, smu_program, smu_version, smu_major, smu_minor, smu_debug); - dev_info(smu->adev->dev, "SMU driver if version not matched\n"); } return ret; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c index 2c20624caca4..ac5e44dff6c9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c @@ -101,7 +101,6 @@ int smu_v12_0_check_fw_version(struct smu_context *smu) "smu fw program = %d, smu fw version = 0x%08x (%d.%d.%d)\n", smu->smc_driver_if_version, if_version, smu_program, smu_version, smu_major, smu_minor, smu_debug); - dev_info(smu->adev->dev, "SMU driver if version not matched\n"); } return ret; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c index cec2df1ad0af..e38354c694c9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c @@ -284,7 +284,6 @@ int smu_v14_0_check_fw_version(struct smu_context *smu) "smu fw program = %d, smu fw version = 0x%08x (%d.%d.%d)\n", smu->smc_driver_if_version, if_version, smu_program, smu_version, smu_major, smu_minor, smu_debug); - dev_info(adev->dev, "SMU driver if version not matched\n"); } return ret; -- cgit v1.2.3 From a018d1819f158991b7308e4f74609c6c029b670c Mon Sep 17 00:00:00 2001 From: Junrui Luo Date: Tue, 24 Mar 2026 17:39:02 +0800 Subject: drm/amdgpu: validate doorbell_offset in user queue creation amdgpu_userq_get_doorbell_index() passes the user-provided doorbell_offset to amdgpu_doorbell_index_on_bar() without bounds checking. An arbitrarily large doorbell_offset can cause the calculated doorbell index to fall outside the allocated doorbell BO, potentially corrupting kernel doorbell space. Validate that doorbell_offset falls within the doorbell BO before computing the BAR index, using u64 arithmetic to prevent overflow. Fixes: f09c1e6077ab ("drm/amdgpu: generate doorbell index for userqueue") Reported-by: Yuhao Jiang Signed-off-by: Junrui Luo Signed-off-by: Alex Deucher (cherry picked from commit de1ef4ffd70e1d15f0bf584fd22b1f28cbd5e2ec) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 7c450350847d..0a1b93259887 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -600,6 +600,13 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr, goto unpin_bo; } + /* Validate doorbell_offset is within the doorbell BO */ + if ((u64)db_info->doorbell_offset * db_size + db_size > + amdgpu_bo_size(db_obj->obj)) { + r = -EINVAL; + goto unpin_bo; + } + index = amdgpu_doorbell_index_on_bar(uq_mgr->adev, db_obj->obj, db_info->doorbell_offset, db_size); drm_dbg_driver(adev_to_drm(uq_mgr->adev), -- cgit v1.2.3 From 62f553d60a801384336f5867967c26ddf3b17038 Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Mon, 23 Mar 2026 16:07:02 +0800 Subject: drm/amdgpu: fix the idr allocation flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the IDR allocation flags by using atomic GFP flags in non‑sleepable contexts to avoid the __might_sleep() complaint. 268.290239] [drm] Initialized amdgpu 3.64.0 for 0000:03:00.0 on minor 0 [ 268.294900] BUG: sleeping function called from invalid context at ./include/linux/sched/mm.h:323 [ 268.295355] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1744, name: modprobe [ 268.295705] preempt_count: 1, expected: 0 [ 268.295886] RCU nest depth: 0, expected: 0 [ 268.296072] 2 locks held by modprobe/1744: [ 268.296077] #0: ffff8c3a44abd1b8 (&dev->mutex){....}-{4:4}, at: __driver_attach+0xe4/0x210 [ 268.296100] #1: ffffffffc1a6ea78 (amdgpu_pasid_idr_lock){+.+.}-{3:3}, at: amdgpu_pasid_alloc+0x26/0xe0 [amdgpu] [ 268.296494] CPU: 12 UID: 0 PID: 1744 Comm: modprobe Tainted: G U OE 6.19.0-custom #16 PREEMPT(voluntary) [ 268.296498] Tainted: [U]=USER, [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [ 268.296499] Hardware name: AMD Majolica-RN/Majolica-RN, BIOS RMJ1009A 06/13/2021 [ 268.296501] Call Trace: Fixes: 8f1de51f49be ("drm/amdgpu: prevent immediate PASID reuse case") Tested-by: Borislav Petkov (AMD) Signed-off-by: Prike Liang Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit ea56aa2625708eaf96f310032391ff37746310ef) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index d88523568b62..569c5a89ff10 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -68,8 +68,11 @@ int amdgpu_pasid_alloc(unsigned int bits) return -EINVAL; spin_lock(&amdgpu_pasid_idr_lock); + /* TODO: Need to replace the idr with an xarry, and then + * handle the internal locking with ATOMIC safe paths. + */ pasid = idr_alloc_cyclic(&amdgpu_pasid_idr, NULL, 1, - 1U << bits, GFP_KERNEL); + 1U << bits, GFP_ATOMIC); spin_unlock(&amdgpu_pasid_idr_lock); if (pasid >= 0) -- cgit v1.2.3 From 68484a648ade555842e0c75a392f3572b3d51998 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Mon, 23 Mar 2026 09:58:35 +0530 Subject: drm/amdkfd: Align expected_queue_size to PAGE_SIZE The AQL queue size can be 4K, but the minimum buffer object (BO) allocation size is PAGE_SIZE. On systems with a page size larger than 4K, the expected queue size does not match the allocated BO size, causing queue creation to fail. Align the expected queue size to PAGE_SIZE so that it matches the allocated BO size and allows queue creation to succeed. Reviewed-by: Felix Kuehling Signed-off-by: Donet Tom Signed-off-by: Alex Deucher (cherry picked from commit b01cd158a2f5230b137396c5f8cda3fc780abbc2) --- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index bbe869ceae3f..e1a922bb2ab7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -249,10 +249,10 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope topo_dev->node_props.gfx_target_version < 90000) /* metadata_queue_size not supported on GFX7/GFX8 */ expected_queue_size = - properties->queue_size / 2; + PAGE_ALIGN(properties->queue_size / 2); else expected_queue_size = - properties->queue_size + properties->metadata_queue_size; + PAGE_ALIGN(properties->queue_size + properties->metadata_queue_size); vm = drm_priv_to_vm(pdd->drm_priv); err = amdgpu_bo_reserve(vm->root.bo, false); -- cgit v1.2.3 From 6caeace0d1471b33bb43b58893940cc90baca5b9 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Mon, 23 Mar 2026 09:58:38 +0530 Subject: drm/amd: Fix MQD and control stack alignment for non-4K MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For gfxV9, due to a hardware bug ("based on the comments in the code here [1]"), the control stack of a user-mode compute queue must be allocated immediately after the page boundary of its regular MQD buffer. To handle this, we allocate an enlarged MQD buffer where the first page is used as the MQD and the remaining pages store the control stack. Although these regions share the same BO, they require different memory types: the MQD must be UC (uncached), while the control stack must be NC (non-coherent), matching the behavior when the control stack is allocated in user space. This logic works correctly on systems where the CPU page size matches the GPU page size (4K). However, the current implementation aligns both the MQD and the control stack to the CPU PAGE_SIZE. On systems with a larger CPU page size, the entire first CPU page is marked UC—even though that page may contain multiple GPU pages. The GPU treats the second 4K GPU page inside that CPU page as part of the control stack, but it is incorrectly mapped as UC. This patch fixes the issue by aligning both the MQD and control stack sizes to the GPU page size (4K). The first 4K page is correctly marked as UC for the MQD, and the remaining GPU pages are marked NC for the control stack. This ensures proper memory type assignment on systems with larger CPU page sizes. [1]: https://elixir.bootlin.com/linux/v6.18/source/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c#L118 Acked-by: Felix Kuehling Signed-off-by: Donet Tom Signed-off-by: Alex Deucher (cherry picked from commit 998d6781410de1c4b787fdbf6c56e851ea7fa553) --- drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 44 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 16 ++------- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 23 ++++++++----- 4 files changed, 64 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c index e2d32c29668a..bc772ca3dab7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c @@ -403,6 +403,50 @@ void amdgpu_gart_map_vram_range(struct amdgpu_device *adev, uint64_t pa, drm_dev_exit(idx); } +/** + * amdgpu_gart_map_gfx9_mqd - map mqd and ctrl_stack dma_addresses into GART entries + * + * @adev: amdgpu_device pointer + * @offset: offset into the GPU's gart aperture + * @pages: number of pages to bind + * @dma_addr: DMA addresses of pages + * @flags: page table entry flags + * + * Map the MQD and control stack addresses into GART entries with the correct + * memory types on gfxv9. The MQD occupies the first 4KB and is followed by + * the control stack. The MQD uses UC (uncached) memory, while the control stack + * uses NC (non-coherent) memory. + */ +void amdgpu_gart_map_gfx9_mqd(struct amdgpu_device *adev, uint64_t offset, + int pages, dma_addr_t *dma_addr, uint64_t flags) +{ + uint64_t page_base; + unsigned int i, j, t; + int idx; + uint64_t ctrl_flags = AMDGPU_PTE_MTYPE_VG10(flags, AMDGPU_MTYPE_NC); + void *dst; + + if (!adev->gart.ptr) + return; + + if (!drm_dev_enter(adev_to_drm(adev), &idx)) + return; + + t = offset / AMDGPU_GPU_PAGE_SIZE; + dst = adev->gart.ptr; + for (i = 0; i < pages; i++) { + page_base = dma_addr[i]; + for (j = 0; j < AMDGPU_GPU_PAGES_IN_CPU_PAGE; j++, t++) { + if ((i == 0) && (j == 0)) + amdgpu_gmc_set_pte_pde(adev, dst, t, page_base, flags); + else + amdgpu_gmc_set_pte_pde(adev, dst, t, page_base, ctrl_flags); + page_base += AMDGPU_GPU_PAGE_SIZE; + } + } + drm_dev_exit(idx); +} + /** * amdgpu_gart_bind - bind pages into the gart page table * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h index d3118275ddae..6ebd2da32ea6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h @@ -62,6 +62,8 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset, void amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset, int pages, dma_addr_t *dma_addr, uint64_t flags, void *dst); +void amdgpu_gart_map_gfx9_mqd(struct amdgpu_device *adev, uint64_t offset, + int pages, dma_addr_t *dma_addr, uint64_t flags); void amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset, int pages, dma_addr_t *dma_addr, uint64_t flags); void amdgpu_gart_map_vram_range(struct amdgpu_device *adev, uint64_t pa, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index eeaa56c8d129..0ccb31788b20 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -853,25 +853,15 @@ static void amdgpu_ttm_gart_bind_gfx9_mqd(struct amdgpu_device *adev, int num_xcc = max(1U, adev->gfx.num_xcc_per_xcp); uint64_t page_idx, pages_per_xcc; int i; - uint64_t ctrl_flags = AMDGPU_PTE_MTYPE_VG10(flags, AMDGPU_MTYPE_NC); pages_per_xcc = total_pages; do_div(pages_per_xcc, num_xcc); for (i = 0, page_idx = 0; i < num_xcc; i++, page_idx += pages_per_xcc) { - /* MQD page: use default flags */ - amdgpu_gart_bind(adev, + amdgpu_gart_map_gfx9_mqd(adev, gtt->offset + (page_idx << PAGE_SHIFT), - 1, >t->ttm.dma_address[page_idx], flags); - /* - * Ctrl pages - modify the memory type to NC (ctrl_flags) from - * the second page of the BO onward. - */ - amdgpu_gart_bind(adev, - gtt->offset + ((page_idx + 1) << PAGE_SHIFT), - pages_per_xcc - 1, - >t->ttm.dma_address[page_idx + 1], - ctrl_flags); + pages_per_xcc, >t->ttm.dma_address[page_idx], + flags); } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index d5c234f30e8d..a535f151cb5f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -42,9 +42,16 @@ static uint64_t mqd_stride_v9(struct mqd_manager *mm, struct queue_properties *q) { if (mm->dev->kfd->cwsr_enabled && - q->type == KFD_QUEUE_TYPE_COMPUTE) - return ALIGN(q->ctl_stack_size, PAGE_SIZE) + - ALIGN(sizeof(struct v9_mqd), PAGE_SIZE); + q->type == KFD_QUEUE_TYPE_COMPUTE) { + + /* On gfxv9, the MQD resides in the first 4K page, + * followed by the control stack. Align both to + * AMDGPU_GPU_PAGE_SIZE to maintain the required 4K boundary. + */ + + return ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) + + ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), PAGE_SIZE); + } return mm->mqd_size; } @@ -151,8 +158,8 @@ static struct kfd_mem_obj *allocate_mqd(struct mqd_manager *mm, if (!mqd_mem_obj) return NULL; retval = amdgpu_amdkfd_alloc_kernel_mem(node->adev, - (ALIGN(q->ctl_stack_size, PAGE_SIZE) + - ALIGN(sizeof(struct v9_mqd), PAGE_SIZE)) * + (ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) + + ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), PAGE_SIZE)) * NUM_XCC(node->xcc_mask), mqd_on_vram(node->adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT, @@ -360,7 +367,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, struct kfd_context_save_area_header header; /* Control stack is located one page after MQD. */ - void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); + void *mqd_ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE); m = get_mqd(mqd); @@ -397,7 +404,7 @@ static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, voi { struct v9_mqd *m; /* Control stack is located one page after MQD. */ - void *ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); + void *ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE); m = get_mqd(mqd); @@ -443,7 +450,7 @@ static void restore_mqd(struct mqd_manager *mm, void **mqd, *gart_addr = addr; /* Control stack is located one page after MQD. */ - ctl_stack = (void *)((uintptr_t)*mqd + PAGE_SIZE); + ctl_stack = (void *)((uintptr_t)*mqd + AMDGPU_GPU_PAGE_SIZE); memcpy(ctl_stack, ctl_stack_src, ctl_stack_size); m->cp_hqd_pq_doorbell_control = -- cgit v1.2.3 From ced5c30e47d1cd52d6ae40f809223a6286854086 Mon Sep 17 00:00:00 2001 From: Junrui Luo Date: Sat, 14 Mar 2026 23:33:53 +0800 Subject: drm/amdgpu/userq: fix memory leak in MQD creation error paths In mes_userq_mqd_create(), the memdup_user() allocations for IP-specific MQD structs are not freed when subsequent VA validation fails. The goto free_mqd label only cleans up the MQD BO object and userq_props. Fix by adding kfree() before each goto free_mqd on VA validation failure in the COMPUTE, GFX, and SDMA branches. Fixes: 9e46b8bb0539 ("drm/amdgpu: validate userq buffer virtual address and size") Reported-by: Yuhao Jiang Signed-off-by: Junrui Luo Reviewed-by: Prike Liang Signed-off-by: Alex Deucher (cherry picked from commit 27f5ff9e4a4150d7cf8b4085aedd3b77ddcc5d08) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index 8c74894254f7..faac21ee5739 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -324,8 +324,10 @@ static int mes_userq_mqd_create(struct amdgpu_usermode_queue *queue, r = amdgpu_userq_input_va_validate(adev, queue, compute_mqd->eop_va, 2048); - if (r) + if (r) { + kfree(compute_mqd); goto free_mqd; + } userq_props->eop_gpu_addr = compute_mqd->eop_va; userq_props->hqd_pipe_priority = AMDGPU_GFX_PIPE_PRIO_NORMAL; @@ -365,12 +367,16 @@ static int mes_userq_mqd_create(struct amdgpu_usermode_queue *queue, r = amdgpu_userq_input_va_validate(adev, queue, mqd_gfx_v11->shadow_va, shadow_info.shadow_size); - if (r) + if (r) { + kfree(mqd_gfx_v11); goto free_mqd; + } r = amdgpu_userq_input_va_validate(adev, queue, mqd_gfx_v11->csa_va, shadow_info.csa_size); - if (r) + if (r) { + kfree(mqd_gfx_v11); goto free_mqd; + } kfree(mqd_gfx_v11); } else if (queue->queue_type == AMDGPU_HW_IP_DMA) { @@ -390,8 +396,10 @@ static int mes_userq_mqd_create(struct amdgpu_usermode_queue *queue, } r = amdgpu_userq_input_va_validate(adev, queue, mqd_sdma_v11->csa_va, 32); - if (r) + if (r) { + kfree(mqd_sdma_v11); goto free_mqd; + } userq_props->csa_addr = mqd_sdma_v11->csa_va; kfree(mqd_sdma_v11); -- cgit v1.2.3 From 4487571ef17a30d274600b3bd6965f497a881299 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Thu, 26 Mar 2026 17:51:28 +0530 Subject: drm/amdgpu: Change AMDGPU_VA_RESERVED_TRAP_SIZE to 64KB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, AMDGPU_VA_RESERVED_TRAP_SIZE is hardcoded to 8KB, while KFD_CWSR_TBA_TMA_SIZE is defined as 2 * PAGE_SIZE. On systems with 4K pages, both values match (8KB), so allocation and reserved space are consistent. However, on 64K page-size systems, KFD_CWSR_TBA_TMA_SIZE becomes 128KB, while the reserved trap area remains 8KB. This mismatch causes the kernel to crash when running rocminfo or rccl unit tests. Kernel attempted to read user page (2) - exploit attempt? (uid: 1001) BUG: Kernel NULL pointer dereference on read at 0x00000002 Faulting instruction address: 0xc0000000002c8a64 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries CPU: 34 UID: 1001 PID: 9379 Comm: rocminfo Tainted: G E 6.19.0-rc4-amdgpu-00320-gf23176405700 #56 VOLUNTARY Tainted: [E]=UNSIGNED_MODULE Hardware name: IBM,9105-42A POWER10 (architected) 0x800200 0xf000006 of:IBM,FW1060.30 (ML1060_896) hv:phyp pSeries NIP: c0000000002c8a64 LR: c00000000125dbc8 CTR: c00000000125e730 REGS: c0000001e0957580 TRAP: 0300 Tainted: G E MSR: 8000000000009033 CR: 24008268 XER: 00000036 CFAR: c00000000125dbc4 DAR: 0000000000000002 DSISR: 40000000 IRQMASK: 1 GPR00: c00000000125d908 c0000001e0957820 c0000000016e8100 c00000013d814540 GPR04: 0000000000000002 c00000013d814550 0000000000000045 0000000000000000 GPR08: c00000013444d000 c00000013d814538 c00000013d814538 0000000084002268 GPR12: c00000000125e730 c000007e2ffd5f00 ffffffffffffffff 0000000000020000 GPR16: 0000000000000000 0000000000000002 c00000015f653000 0000000000000000 GPR20: c000000138662400 c00000013d814540 0000000000000000 c00000013d814500 GPR24: 0000000000000000 0000000000000002 c0000001e0957888 c0000001e0957878 GPR28: c00000013d814548 0000000000000000 c00000013d814540 c0000001e0957888 NIP [c0000000002c8a64] __mutex_add_waiter+0x24/0xc0 LR [c00000000125dbc8] __mutex_lock.constprop.0+0x318/0xd00 Call Trace: 0xc0000001e0957890 (unreliable) __mutex_lock.constprop.0+0x58/0xd00 amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu+0x6fc/0xb60 [amdgpu] kfd_process_alloc_gpuvm+0x54/0x1f0 [amdgpu] kfd_process_device_init_cwsr_dgpu+0xa4/0x1a0 [amdgpu] kfd_process_device_init_vm+0xd8/0x2e0 [amdgpu] kfd_ioctl_acquire_vm+0xd0/0x130 [amdgpu] kfd_ioctl+0x514/0x670 [amdgpu] sys_ioctl+0x134/0x180 system_call_exception+0x114/0x300 system_call_vectored_common+0x15c/0x2ec This patch changes AMDGPU_VA_RESERVED_TRAP_SIZE to 64 KB and KFD_CWSR_TBA_TMA_SIZE to the AMD GPU page size. This means we reserve 64 KB for the trap in the address space, but only allocate 8 KB within it. With this approach, the allocation size never exceeds the reserved area. Fixes: 34a1de0f7935 ("drm/amdkfd: Relocate TBA/TMA to opposite side of VM hole") Reviewed-by: Christian König Suggested-by: Felix Kuehling Suggested-by: Christian König Signed-off-by: Donet Tom Signed-off-by: Alex Deucher (cherry picked from commit 31b8de5e55666f26ea7ece5f412b83eab3f56dbb) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index bb276c0ad06d..d5b7061556ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -173,7 +173,7 @@ struct amdgpu_bo_vm; #define AMDGPU_VA_RESERVED_SEQ64_SIZE (2ULL << 20) #define AMDGPU_VA_RESERVED_SEQ64_START(adev) (AMDGPU_VA_RESERVED_CSA_START(adev) \ - AMDGPU_VA_RESERVED_SEQ64_SIZE) -#define AMDGPU_VA_RESERVED_TRAP_SIZE (2ULL << 12) +#define AMDGPU_VA_RESERVED_TRAP_SIZE (1ULL << 16) #define AMDGPU_VA_RESERVED_TRAP_START(adev) (AMDGPU_VA_RESERVED_SEQ64_START(adev) \ - AMDGPU_VA_RESERVED_TRAP_SIZE) #define AMDGPU_VA_RESERVED_BOTTOM (1ULL << 16) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index e5b56412931b..035687a17d89 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -102,8 +102,8 @@ * The first chunk is the TBA used for the CWSR ISA code. The second * chunk is used as TMA for user-mode trap handler setup in daisy-chain mode. */ -#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2) -#define KFD_CWSR_TMA_OFFSET (PAGE_SIZE + 2048) +#define KFD_CWSR_TBA_TMA_SIZE (AMDGPU_GPU_PAGE_SIZE * 2) +#define KFD_CWSR_TMA_OFFSET (AMDGPU_GPU_PAGE_SIZE + 2048) #define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE \ (KFD_MAX_NUM_OF_PROCESSES * \ -- cgit v1.2.3 From e927b36ae18b66b49219eaa9f46edc7b4fdbb25e Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Sat, 21 Mar 2026 17:25:14 +0530 Subject: drm/amd/display: Fix NULL pointer dereference in dcn401_init_hw() dcn401_init_hw() assumes that update_bw_bounding_box() is valid when entering the update path. However, the existing condition: ((!fams2_enable && update_bw_bounding_box) || freq_changed) does not guarantee this, as the freq_changed branch can evaluate to true independently of the callback pointer. This can result in calling update_bw_bounding_box() when it is NULL. Fix this by separating the update condition from the pointer checks and ensuring the callback, dc->clk_mgr, and bw_params are validated before use. Fixes the below: ../dc/hwss/dcn401/dcn401_hwseq.c:367 dcn401_init_hw() error: we previously assumed 'dc->res_pool->funcs->update_bw_bounding_box' could be null (see line 362) Fixes: ca0fb243c3bb ("drm/amd/display: Underflow Seen on DCN401 eGPU") Cc: Daniel Sa Cc: Alvin Lee Cc: Roman Li Cc: Alex Hung Cc: Tom Chung Cc: Dan Carpenter Cc: Aurabindo Pillai Signed-off-by: Srinivasan Shanmugam Reviewed-by: Alex Hung Signed-off-by: Alex Deucher (cherry picked from commit 86117c5ab42f21562fedb0a64bffea3ee5fcd477) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c index eb198d52a115..4dfb6c865831 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c @@ -147,6 +147,7 @@ void dcn401_init_hw(struct dc *dc) int edp_num; uint32_t backlight = MAX_BACKLIGHT_LEVEL; uint32_t user_level = MAX_BACKLIGHT_LEVEL; + bool dchub_ref_freq_changed; int current_dchub_ref_freq = 0; if (dc->clk_mgr && dc->clk_mgr->funcs && dc->clk_mgr->funcs->init_clocks) { @@ -360,14 +361,18 @@ void dcn401_init_hw(struct dc *dc) dc->caps.dmub_caps.psr = dc->ctx->dmub_srv->dmub->feature_caps.psr; dc->caps.dmub_caps.mclk_sw = dc->ctx->dmub_srv->dmub->feature_caps.fw_assisted_mclk_switch_ver > 0; dc->caps.dmub_caps.fams_ver = dc->ctx->dmub_srv->dmub->feature_caps.fw_assisted_mclk_switch_ver; + + /* sw and fw FAMS versions must match for support */ dc->debug.fams2_config.bits.enable &= - dc->caps.dmub_caps.fams_ver == dc->debug.fams_version.ver; // sw & fw fams versions must match for support - if ((!dc->debug.fams2_config.bits.enable && dc->res_pool->funcs->update_bw_bounding_box) - || res_pool->ref_clocks.dchub_ref_clock_inKhz / 1000 != current_dchub_ref_freq) { + dc->caps.dmub_caps.fams_ver == dc->debug.fams_version.ver; + dchub_ref_freq_changed = + res_pool->ref_clocks.dchub_ref_clock_inKhz / 1000 != current_dchub_ref_freq; + if ((!dc->debug.fams2_config.bits.enable || dchub_ref_freq_changed) && + dc->res_pool->funcs->update_bw_bounding_box && + dc->clk_mgr && dc->clk_mgr->bw_params) { /* update bounding box if FAMS2 disabled, or if dchub clk has changed */ - if (dc->clk_mgr) - dc->res_pool->funcs->update_bw_bounding_box(dc, - dc->clk_mgr->bw_params); + dc->res_pool->funcs->update_bw_bounding_box(dc, + dc->clk_mgr->bw_params); } } } -- cgit v1.2.3 From daf470b8882b6f7f53cbfe9ec2b93a1b21528cdc Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 27 Mar 2026 14:29:17 +0530 Subject: drm/amdgpu: Fix wait after reset sequence in S4 For a mode-1 reset done at the end of S4 on PSPv11 dGPUs, only check if TOS is unloaded. Fixes: 32f73741d6ee ("drm/amdgpu: Wait for bootloader after PSPv11 reset") Closes: https://gitlab.freedesktop.org/drm/amd/-/work_items/4853 Signed-off-by: Lijo Lazar Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 2fb4883b884a437d760bd7bdf7695a7e5a60bba3) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 ++++++-- drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 3 ++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 95d26f086d54..c91638e65174 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2703,8 +2703,12 @@ static int amdgpu_pmops_freeze(struct device *dev) if (r) return r; - if (amdgpu_acpi_should_gpu_reset(adev)) - return amdgpu_asic_reset(adev); + if (amdgpu_acpi_should_gpu_reset(adev)) { + amdgpu_device_lock_reset_domain(adev->reset_domain); + r = amdgpu_asic_reset(adev); + amdgpu_device_unlock_reset_domain(adev->reset_domain); + return r; + } return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index 9aa988982304..fb7aaf5ae05c 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -170,7 +170,8 @@ static int psp_v11_0_wait_for_bootloader(struct psp_context *psp) int retry_loop; /* For a reset done at the end of S3, only wait for TOS to be unloaded */ - if (adev->in_s3 && !(adev->flags & AMD_IS_APU) && amdgpu_in_reset(adev)) + if ((adev->in_s4 || adev->in_s3) && !(adev->flags & AMD_IS_APU) && + amdgpu_in_reset(adev)) return psp_v11_wait_for_tos_unload(psp); for (retry_loop = 0; retry_loop < 20; retry_loop++) { -- cgit v1.2.3 From 78746a474e92fc7aaed12219bec7c78ae1bd6156 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Mon, 23 Mar 2026 09:58:39 +0530 Subject: drm/amdkfd: Fix queue preemption/eviction failures by aligning control stack size to GPU page size The control stack size is calculated based on the number of CUs and waves, and is then aligned to PAGE_SIZE. When the resulting control stack size is aligned to 64 KB, GPU hangs and queue preemption failures are observed while running RCCL unit tests on systems with more than two GPUs. amdgpu 0048:0f:00.0: amdgpu: Queue preemption failed for queue with doorbell_id: 80030008 amdgpu 0048:0f:00.0: amdgpu: Failed to evict process queues amdgpu 0048:0f:00.0: amdgpu: GPU reset begin!. Source: 4 amdgpu 0048:0f:00.0: amdgpu: Queue preemption failed for queue with doorbell_id: 80030008 amdgpu 0048:0f:00.0: amdgpu: Failed to evict process queues amdgpu 0048:0f:00.0: amdgpu: Failed to restore process queues This issue is observed on both 4 KB and 64 KB system page-size configurations. This patch fixes the issue by aligning the control stack size to AMDGPU_GPU_PAGE_SIZE instead of PAGE_SIZE, so the control stack size will not be 64 KB on systems with a 64 KB page size and queue preemption works correctly. Additionally, In the current code, wg_data_size is aligned to PAGE_SIZE, which can waste memory if the system page size is large. In this patch, wg_data_size is aligned to AMDGPU_GPU_PAGE_SIZE. The cwsr_size, calculated from wg_data_size and the control stack size, is aligned to PAGE_SIZE. Reviewed-by: Felix Kuehling Signed-off-by: Donet Tom Signed-off-by: Alex Deucher (cherry picked from commit a3e14436304392fbada359edd0f1d1659850c9b7) --- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index e1a922bb2ab7..28354a4e5dd5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -492,10 +492,11 @@ void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev) cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask); wave_num = get_num_waves(props, gfxv, cu_num); - wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props), PAGE_SIZE); + wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props), + AMDGPU_GPU_PAGE_SIZE); ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8; ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size, - PAGE_SIZE); + AMDGPU_GPU_PAGE_SIZE); if ((gfxv / 10000 * 10000) == 100000) { /* HW design limits control stack size to 0x7000. @@ -507,7 +508,7 @@ void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev) props->ctl_stack_size = ctl_stack_size; props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN); - props->cwsr_size = ctl_stack_size + wg_data_size; + props->cwsr_size = ALIGN(ctl_stack_size + wg_data_size, PAGE_SIZE); if (gfxv == 80002) /* GFX_VERSION_TONGA */ props->eop_buffer_size = 0x8000; -- cgit v1.2.3