From 49abfa812617a7f2d0132c70d23ac98b389c6ec1 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Mon, 23 Feb 2026 12:41:30 +0000 Subject: drm/amdgpu/userq: Fix reference leak in amdgpu_userq_wait_ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop reference to syncobj and timeline fence when aborting the ioctl due output array being too small. Reviewed-by: Alex Deucher Signed-off-by: Tvrtko Ursulin Fixes: a292fdecd728 ("drm/amdgpu: Implement userqueue signal/wait IOCTL") Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 68951e9c3e6bb22396bc42ef2359751c8315dd27) Cc: # v6.16+ --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 8013260e29dc..9b9947b94b89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -876,6 +876,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, dma_fence_unwrap_for_each(f, &iter, fence) { if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { r = -EINVAL; + dma_fence_put(fence); goto free_fences; } @@ -900,6 +901,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { r = -EINVAL; + dma_fence_put(fence); goto free_fences; } -- cgit v1.2.3 From 7b7d7693a55d606d700beb9549c9f7f0e5d9c24f Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Mon, 23 Feb 2026 12:41:31 +0000 Subject: drm/amdgpu/userq: Do not allow userspace to trivially triger kernel warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userspace can either deliberately pass in the too small num_fences, or the required number can legitimately grow between the two calls to the userq wait ioctl. In both cases we do not want the emit the kernel warning backtrace since nothing is wrong with the kernel and userspace will simply get an errno reported back. So lets simply drop the WARN_ONs. Reviewed-by: Alex Deucher Signed-off-by: Tvrtko Ursulin Fixes: a292fdecd728 ("drm/amdgpu: Implement userqueue signal/wait IOCTL") Cc: Arunpravin Paneer Selvam Cc: Christian König Cc: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 2c333ea579de6cc20ea7bc50e9595ef72863e65c) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 9b9947b94b89..d972dc46f5a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -833,7 +833,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, dma_resv_for_each_fence(&resv_cursor, gobj_read[i]->resv, DMA_RESV_USAGE_READ, fence) { - if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { + if (num_fences >= wait_info->num_fences) { r = -EINVAL; goto free_fences; } @@ -850,7 +850,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, dma_resv_for_each_fence(&resv_cursor, gobj_write[i]->resv, DMA_RESV_USAGE_WRITE, fence) { - if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { + if (num_fences >= wait_info->num_fences) { r = -EINVAL; goto free_fences; } @@ -874,7 +874,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, goto free_fences; dma_fence_unwrap_for_each(f, &iter, fence) { - if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { + if (num_fences >= wait_info->num_fences) { r = -EINVAL; dma_fence_put(fence); goto free_fences; @@ -899,7 +899,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, if (r) goto free_fences; - if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { + if (num_fences >= wait_info->num_fences) { r = -EINVAL; dma_fence_put(fence); goto free_fences; -- cgit v1.2.3 From ea78f8c68f4f6211c557df49174c54d167821962 Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Fri, 20 Feb 2026 13:47:58 +0530 Subject: drm/amdgpu: add upper bound check on user inputs in signal ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Huge input values in amdgpu_userq_signal_ioctl can lead to a OOM and could be exploited. So check these input value against AMDGPU_USERQ_MAX_HANDLES which is big enough value for genuine use cases and could potentially avoid OOM. Signed-off-by: Sunil Khatri Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit be267e15f99bc97cbe202cd556717797cdcf79a5) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index d972dc46f5a8..c5f5af20af75 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -35,6 +35,8 @@ static const struct dma_fence_ops amdgpu_userq_fence_ops; static struct kmem_cache *amdgpu_userq_fence_slab; +#define AMDGPU_USERQ_MAX_HANDLES (1U << 16) + int amdgpu_userq_fence_slab_init(void) { amdgpu_userq_fence_slab = kmem_cache_create("amdgpu_userq_fence", @@ -478,6 +480,11 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, if (!amdgpu_userq_enabled(dev)) return -ENOTSUPP; + if (args->num_syncobj_handles > AMDGPU_USERQ_MAX_HANDLES || + args->num_bo_write_handles > AMDGPU_USERQ_MAX_HANDLES || + args->num_bo_read_handles > AMDGPU_USERQ_MAX_HANDLES) + return -EINVAL; + num_syncobj_handles = args->num_syncobj_handles; syncobj_handles = memdup_user(u64_to_user_ptr(args->syncobj_handles), size_mul(sizeof(u32), num_syncobj_handles)); -- cgit v1.2.3 From 64ac7c09fc44985ec9bb6a9db740899fa40ca613 Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Tue, 24 Feb 2026 12:13:09 +0530 Subject: drm/amdgpu: add upper bound check on user inputs in wait ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Huge input values in amdgpu_userq_wait_ioctl can lead to a OOM and could be exploited. So check these input value against AMDGPU_USERQ_MAX_HANDLES which is big enough value for genuine use cases and could potentially avoid OOM. v2: squash in Srini's fix Signed-off-by: Sunil Khatri Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit fcec012c664247531aed3e662f4280ff804d1476) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index c5f5af20af75..7e9cf1868cc9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -671,6 +671,11 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, if (!amdgpu_userq_enabled(dev)) return -ENOTSUPP; + if (wait_info->num_syncobj_handles > AMDGPU_USERQ_MAX_HANDLES || + wait_info->num_bo_write_handles > AMDGPU_USERQ_MAX_HANDLES || + wait_info->num_bo_read_handles > AMDGPU_USERQ_MAX_HANDLES) + return -EINVAL; + num_read_bo_handles = wait_info->num_bo_read_handles; bo_handles_read = memdup_user(u64_to_user_ptr(wait_info->bo_read_handles), size_mul(sizeof(u32), num_read_bo_handles)); -- cgit v1.2.3 From 5e0bcc7b88bcd081aaae6f481b10d9ab294fcb69 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 23 Feb 2026 14:00:07 -0800 Subject: drm/amdgpu: Unlock a mutex before destroying it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mutexes must be unlocked before these are destroyed. This has been detected by the Clang thread-safety analyzer. Cc: Alex Deucher Cc: Christian König Cc: Yang Wang Cc: Hawking Zhang Cc: amd-gfx@lists.freedesktop.org Fixes: f5e4cc8461c4 ("drm/amdgpu: implement RAS ACA driver framework") Reviewed-by: Yang Wang Acked-by: Christian König Signed-off-by: Bart Van Assche Signed-off-by: Alex Deucher (cherry picked from commit 270258ba320beb99648dceffb67e86ac76786e55) --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index afe5ca81beec..db7858fe0c3d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -641,6 +641,7 @@ static void aca_error_fini(struct aca_error *aerr) aca_bank_error_remove(aerr, bank_error); out_unlock: + mutex_unlock(&aerr->lock); mutex_destroy(&aerr->lock); } -- cgit v1.2.3 From 480ad5f6ead4a47b969aab6618573cd6822bb6a4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 23 Feb 2026 13:50:23 -0800 Subject: drm/amdgpu: Fix locking bugs in error paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not unlock psp->ras_context.mutex if it has not been locked. This has been detected by the Clang thread-safety analyzer. Cc: Alex Deucher Cc: Christian König Cc: YiPeng Chai Cc: Hawking Zhang Cc: amd-gfx@lists.freedesktop.org Fixes: b3fb79cda568 ("drm/amdgpu: add mutex to protect ras shared memory") Acked-by: Christian König Signed-off-by: Bart Van Assche Signed-off-by: Alex Deucher (cherry picked from commit 6fa01b4335978051d2cd80841728fd63cc597970) --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c index 6e8aad91bcd3..0d3c18f04ac3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c @@ -332,13 +332,13 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size if (!context || !context->initialized) { dev_err(adev->dev, "TA is not initialized\n"); ret = -EINVAL; - goto err_free_shared_buf; + goto free_shared_buf; } if (!psp->ta_funcs || !psp->ta_funcs->fn_ta_invoke) { dev_err(adev->dev, "Unsupported function to invoke TA\n"); ret = -EOPNOTSUPP; - goto err_free_shared_buf; + goto free_shared_buf; } context->session_id = ta_id; @@ -346,7 +346,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size mutex_lock(&psp->ras_context.mutex); ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len); if (ret) - goto err_free_shared_buf; + goto unlock; ret = psp_fn_ta_invoke(psp, cmd_id); if (ret || context->resp_status) { @@ -354,15 +354,17 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size ret, context->resp_status); if (!ret) { ret = -EINVAL; - goto err_free_shared_buf; + goto unlock; } } if (copy_to_user((char *)&buf[copy_pos], context->mem_context.shared_buf, shared_buf_len)) ret = -EFAULT; -err_free_shared_buf: +unlock: mutex_unlock(&psp->ras_context.mutex); + +free_shared_buf: kfree(shared_buf); return ret; -- cgit v1.2.3 From a5fe1a54513196e4bc8f9170006057dc31e7155e Mon Sep 17 00:00:00 2001 From: sguttula Date: Sat, 21 Feb 2026 10:03:32 +0530 Subject: drm/amdgpu/vcn5: Add SMU dpm interface type This will set AMDGPU_VCN_SMU_DPM_INTERFACE_* smu_type based on soc type and fixing ring timeout issue seen for DPM enabled case. Signed-off-by: sguttula Reviewed-by: Pratik Vishwakarma Signed-off-by: Alex Deucher (cherry picked from commit f0f23c315b38c55e8ce9484cf59b65811f350630) --- drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c index 0202df5db1e1..6109124f852e 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c @@ -174,6 +174,10 @@ static int vcn_v5_0_0_sw_init(struct amdgpu_ip_block *ip_block) fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE); fw_shared->sq.is_enabled = 1; + fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_SMU_DPM_INTERFACE_FLAG); + fw_shared->smu_dpm_interface.smu_interface_type = (adev->flags & AMD_IS_APU) ? + AMDGPU_VCN_SMU_DPM_INTERFACE_APU : AMDGPU_VCN_SMU_DPM_INTERFACE_DGPU; + if (amdgpu_vcnfw_log) amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]); -- cgit v1.2.3 From b57c4ec98c17789136a4db948aec6daadceb5024 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 24 Feb 2026 10:18:51 +0530 Subject: drm/amdgpu: Fix error handling in slot reset If the device has not recovered after slot reset is called, it goes to out label for error handling. There it could make decision based on uninitialized hive pointer and could result in accessing an uninitialized list. Initialize the list and hive properly so that it handles the error situation and also releases the reset domain lock which is acquired during error_detected callback. Fixes: 732c6cefc1ec ("drm/amdgpu: Replace tmp_adev with hive in amdgpu_pci_slot_reset") Signed-off-by: Lijo Lazar Reviewed-by: Ce Sun Signed-off-by: Alex Deucher (cherry picked from commit bb71362182e59caa227e4192da5a612b09349696) --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d9789e0b5201..3e19b51a2763 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -7059,6 +7059,15 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) dev_info(adev->dev, "PCI error: slot reset callback!!\n"); memset(&reset_context, 0, sizeof(reset_context)); + INIT_LIST_HEAD(&device_list); + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + mutex_lock(&hive->hive_lock); + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) + list_add_tail(&tmp_adev->reset_list, &device_list); + } else { + list_add_tail(&adev->reset_list, &device_list); + } if (adev->pcie_reset_ctx.swus) link_dev = adev->pcie_reset_ctx.swus; @@ -7099,19 +7108,13 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) reset_context.reset_req_dev = adev; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); - INIT_LIST_HEAD(&device_list); - hive = amdgpu_get_xgmi_hive(adev); if (hive) { - mutex_lock(&hive->hive_lock); reset_context.hive = hive; - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) tmp_adev->pcie_reset_ctx.in_link_reset = true; - list_add_tail(&tmp_adev->reset_list, &device_list); - } } else { set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); - list_add_tail(&adev->reset_list, &device_list); } r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); -- cgit v1.2.3 From 6b0d812971370c64b837a2db4275410f478272fe Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 25 Feb 2026 10:51:16 -0600 Subject: drm/amd: Disable MES LR compute W/A A workaround was introduced in commit 1fb710793ce2 ("drm/amdgpu: Enable MES lr_compute_wa by default") to help with some hangs observed in gfx1151. This WA didn't fully fix the issue. It was actually fixed by adjusting the VGPR size to the correct value that matched the hardware in commit b42f3bf9536c ("drm/amdkfd: bump minimum vgpr size for gfx1151"). There are reports of instability on other products with newer GC microcode versions, and I believe they're caused by this workaround. As we don't need the workaround any more, remove it. Fixes: b42f3bf9536c ("drm/amdkfd: bump minimum vgpr size for gfx1151") Acked-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 9973e64bd6ee7642860a6f3b6958cbf14e89cabd) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 5 ----- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 5 ----- 2 files changed, 10 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 09ebb13ca5e8..a926a330700e 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -720,11 +720,6 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) mes_set_hw_res_pkt.enable_reg_active_poll = 1; mes_set_hw_res_pkt.enable_level_process_quantum_check = 1; mes_set_hw_res_pkt.oversubscription_timer = 50; - if ((mes->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x7f) - mes_set_hw_res_pkt.enable_lr_compute_wa = 1; - else - dev_info_once(mes->adev->dev, - "MES FW version must be >= 0x7f to enable LR compute workaround.\n"); if (amdgpu_mes_log_enable) { mes_set_hw_res_pkt.enable_mes_event_int_logging = 1; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index b1c864dc79a8..5bfa5d1d0b36 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -779,11 +779,6 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe) mes_set_hw_res_pkt.use_different_vmid_compute = 1; mes_set_hw_res_pkt.enable_reg_active_poll = 1; mes_set_hw_res_pkt.enable_level_process_quantum_check = 1; - if ((mes->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x82) - mes_set_hw_res_pkt.enable_lr_compute_wa = 1; - else - dev_info_once(adev->dev, - "MES FW version must be >= 0x82 to enable LR compute workaround.\n"); /* * Keep oversubscribe timer for sdma . When we have unmapped doorbell -- cgit v1.2.3