From a813437c33842c4e28a0656a9d8f20c3a8d35d6d Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Wed, 16 Jul 2025 11:16:20 +0800 Subject: drm/amdgpu: add command to check address validity Add command to check address validity and remove unused command codes. v2: The command interface adds new parameters to support multiple check address strategies. Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 927d6bff734a..7f10a7402160 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -570,6 +570,9 @@ struct amdgpu_ras { struct ras_event_manager *event_mgr; uint64_t reserved_pages_in_bytes; + + pid_t init_task_pid; + char init_task_comm[TASK_COMM_LEN]; }; struct ras_fs_data { -- cgit v1.2.3 From d45c5e6845a76169ef3d6076f0f04487e5776905 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Fri, 4 Jul 2025 17:12:24 +0800 Subject: drm/amdgpu: adjust the update of RAS bad page number One eeprom record may not map to unit number of bad pages, the accurate bad page number is gotten after bad page address check. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 7f10a7402160..aa155cb5d58e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -573,6 +573,8 @@ struct amdgpu_ras { pid_t init_task_pid; char init_task_comm[TASK_COMM_LEN]; + + int bad_page_num; }; struct ras_fs_data { @@ -611,6 +613,7 @@ struct ras_err_handler_data { struct eeprom_table_record *bps; /* the count of entries */ int count; + int count_saved; /* the space can place new entries */ int space_left; }; -- cgit v1.2.3 From f3486918979030f8982e1af901561dbd6e2cd1bc Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Thu, 24 Jul 2025 15:34:29 +0800 Subject: drm/amdgpu: support ras critical address check Support ras critical address check. Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index aa155cb5d58e..434e23c84962 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -496,6 +496,13 @@ struct ras_ecc_log_info { uint64_t prev_de_queried_count; }; +struct ras_critical_region { + struct list_head node; + struct amdgpu_bo *bo; + uint64_t start; + uint64_t size; +}; + struct amdgpu_ras { /* ras infrastructure */ /* for ras itself. */ @@ -575,6 +582,10 @@ struct amdgpu_ras { char init_task_comm[TASK_COMM_LEN]; int bad_page_num; + + struct list_head critical_region_head; + struct mutex critical_region_lock; + }; struct ras_fs_data { @@ -979,6 +990,9 @@ int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn); +int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, struct amdgpu_bo *bo); +bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr); + int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, enum amdgpu_ras_block block, uint16_t pasid, pasid_notify pasid_fn, void *data, uint32_t reset); -- cgit v1.2.3 From 21c0ffa612c98bcc6dab5bd9d977a18d565ee28e Mon Sep 17 00:00:00 2001 From: Ce Sun Date: Sun, 27 Jul 2025 12:06:55 +0800 Subject: drm/amdgpu: Avoid rma causes GPU duplicate reset Try to ensure poison creation handle is completed in time to set device rma value. Signed-off-by: Ce Sun Signed-off-by: Stanley.Yang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 434e23c84962..ff63020f9c6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -522,6 +522,7 @@ struct amdgpu_ras { /* gpu recovery */ struct work_struct recovery_work; atomic_t in_recovery; + atomic_t rma_in_recovery; struct amdgpu_device *adev; /* error handler data */ struct ras_err_handler_data *eh_data; -- cgit v1.2.3 From 0989b764f43d4de2c6665c15165c251d9cfde9c0 Mon Sep 17 00:00:00 2001 From: Ce Sun Date: Tue, 19 Aug 2025 14:47:05 +0800 Subject: drm/amdgpu: Add a mutex lock to protect poison injection When poison is triggered multiple times, competition will occur. Add a mutex lock to protect poison injection Signed-off-by: Ce Sun Reviewed-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ff63020f9c6c..2a70782b07bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -587,6 +587,8 @@ struct amdgpu_ras { struct list_head critical_region_head; struct mutex critical_region_lock; + /* Protect poison injection */ + struct mutex poison_lock; }; struct ras_fs_data { -- cgit v1.2.3 From d8442bcad0764c5613e9f8b2356f3e0a48327e20 Mon Sep 17 00:00:00 2001 From: Ce Sun Date: Wed, 20 Aug 2025 17:18:57 +0800 Subject: drm/amdgpu: Correct the loss of aca bank reg info By polling, poll ACA bank count to ensure that valid ACA bank reg info can be obtained v2: add corresponding delay before send msg to SMU to query mca bank info (Stanley) v3: the loop cannot exit. (Thomas) v4: remove amdgpu_aca_clear_bank_count. (Kevin) v5: continuously inject ce. If a creation interruption occurs at this time, bank reg info will be lost. (Thomas) v5: each cycle is delayed by 100ms. (Tao) Signed-off-by: Ce Sun Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 2a70782b07bf..6cf0dfd38be8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -492,8 +492,8 @@ struct ras_ecc_err { struct ras_ecc_log_info { struct mutex lock; struct radix_tree_root de_page_tree; - uint64_t de_queried_count; - uint64_t prev_de_queried_count; + uint64_t de_queried_count; + uint64_t consumption_q_count; }; struct ras_critical_region { @@ -565,6 +565,7 @@ struct amdgpu_ras { struct mutex page_retirement_lock; atomic_t page_retirement_req_cnt; atomic_t poison_creation_count; + atomic_t poison_consumption_count; struct mutex page_rsv_lock; DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); struct ras_ecc_log_info umc_ecc_log; -- cgit v1.2.3