diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 62 | ||||
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm | 37 | ||||
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 1 | ||||
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 46 | ||||
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 4 |
5 files changed, 110 insertions, 40 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 0320163b6e74..f98c735b2905 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -3644,14 +3644,18 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { }; static const uint32_t cwsr_trap_gfx12_hex[] = { - 0xbfa00001, 0xbfa002a2, - 0xb0804009, 0xb8f8f804, + 0xbfa00001, 0xbfa002b2, + 0xb0804009, 0xb8eef81a, + 0xbf880000, 0xb980081a, + 0x00000000, 0xb8f8f804, + 0x9177ff77, 0x0c000000, + 0x846e9a6e, 0x8c776e77, 0x9178ff78, 0x00008c00, 0xb8fbf811, 0x8b6eff78, 0x00004000, 0xbfa10008, 0x8b6eff7b, 0x00000080, 0xbfa20018, 0x8b6ea07b, - 0xbfa20042, 0xbf830010, + 0xbfa2004a, 0xbf830010, 0xb8fbf811, 0xbfa0fffb, 0x8b6eff7b, 0x00000bd0, 0xbfa20010, 0xb8eef812, @@ -3662,28 +3666,32 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0xf0000000, 0xbfa20005, 0x8b6fff6f, 0x00000200, 0xbfa20002, 0x8b6ea07b, - 0xbfa2002c, 0xbefa4d82, + 0xbfa20034, 0xbefa4d82, 0xbf8a0000, 0x84fa887a, 0xbf0d8f7b, 0xbfa10002, 0x8c7bff7b, 0xffff0000, - 0xf4601bbd, 0xf8000010, - 0xbf8a0000, 0x846e976e, - 0x9177ff77, 0x00800000, - 0x8c776e77, 0xf4603bbd, - 0xf8000000, 0xbf8a0000, - 0xf4603ebd, 0xf8000008, - 0xbf8a0000, 0x8bee6e6e, - 0xbfa10001, 0xbe80486e, - 0x8b6eff6d, 0xf0000000, - 0xbfa20009, 0xb8eef811, - 0x8b6eff6e, 0x00000080, - 0xbfa20007, 0x8c78ff78, - 0x00004000, 0x80ec886c, - 0x82ed806d, 0xbfa00002, - 0x806c846c, 0x826d806d, - 0x8b6dff6d, 0x0000ffff, - 0x8bfe7e7e, 0x8bea6a6a, - 0x85788978, 0xb9783244, + 0x8b6eff77, 0x0c000000, + 0x916dff6d, 0x0c000000, + 0x8c6d6e6d, 0xf4601bbd, + 0xf8000010, 0xbf8a0000, + 0x846e976e, 0x9177ff77, + 0x00800000, 0x8c776e77, + 0xf4603bbd, 0xf8000000, + 0xbf8a0000, 0xf4603ebd, + 0xf8000008, 0xbf8a0000, + 0x8bee6e6e, 0xbfa10001, + 0xbe80486e, 0x8b6eff6d, + 0xf0000000, 0xbfa20009, + 0xb8eef811, 0x8b6eff6e, + 0x00000080, 0xbfa20007, + 0x8c78ff78, 0x00004000, + 0x80ec886c, 0x82ed806d, + 0xbfa00002, 0x806c846c, + 0x826d806d, 0x8b6dff6d, + 0x0000ffff, 0x8bfe7e7e, + 0x8bea6a6a, 0x85788978, + 0x936eff77, 0x0002001a, + 0xb96ef81a, 0xb9783244, 0xbe804a6c, 0xb8faf802, 0xbf0d987a, 0xbfa10001, 0xbfb00000, 0x8b6dff6d, @@ -3981,7 +3989,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0x008ce800, 0x00000000, 0x807d817d, 0x8070ff70, 0x00000080, 0xbf0a7b7d, - 0xbfa2fff7, 0xbfa0016e, + 0xbfa2fff7, 0xbfa00171, 0xbef4007e, 0x8b75ff7f, 0x0000ffff, 0x8c75ff75, 0x00040000, 0xbef60080, @@ -4163,12 +4171,14 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0xf8000074, 0xbf8a0000, 0x8b6dff6d, 0x0000ffff, 0x8bfe7e7e, 0x8bea6a6a, - 0xb97af804, 0xbe804ec2, - 0xbf94fffe, 0xbe804a6c, + 0x936eff77, 0x0002001a, + 0xb96ef81a, 0xb97af804, 0xbe804ec2, 0xbf94fffe, - 0xbfb10000, 0xbf9f0000, + 0xbe804a6c, 0xbe804ec2, + 0xbf94fffe, 0xbfb10000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0x00000000, }; static const uint32_t cwsr_trap_gfx9_5_0_hex[] = { diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm index 5a1a1b1f897f..07999b4649de 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm @@ -78,9 +78,16 @@ var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE = SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT = SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE = 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT + +var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT = 0 +var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE = 2 + var BARRIER_STATE_SIGNAL_OFFSET = 16 var BARRIER_STATE_VALID_OFFSET = 0 +var TTMP11_SCHED_MODE_SHIFT = 26 +var TTMP11_SCHED_MODE_SIZE = 2 +var TTMP11_SCHED_MODE_MASK = 0xC000000 var TTMP11_DEBUG_TRAP_ENABLED_SHIFT = 23 var TTMP11_DEBUG_TRAP_ENABLED_MASK = 0x800000 @@ -160,8 +167,19 @@ L_JUMP_TO_RESTORE: s_branch L_RESTORE L_SKIP_RESTORE: + // Assume most relaxed scheduling mode is set. Save and revert to normal mode. + s_getreg_b32 ttmp2, hwreg(HW_REG_WAVE_SCHED_MODE) + s_wait_alu 0 + s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, \ + SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT, SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE), 0 + s_getreg_b32 s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV) //save STATUS since we will change SCC + // Save SCHED_MODE[1:0] into ttmp11[27:26]. + s_andn2_b32 ttmp11, ttmp11, TTMP11_SCHED_MODE_MASK + s_lshl_b32 ttmp2, ttmp2, TTMP11_SCHED_MODE_SHIFT + s_or_b32 ttmp11, ttmp11, ttmp2 + // Clear SPI_PRIO: do not save with elevated priority. // Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd. s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK @@ -238,6 +256,13 @@ L_FETCH_2ND_TRAP: s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA s_or_b32 ttmp15, ttmp15, 0xFFFF0000 L_NO_SIGN_EXTEND_TMA: +#if ASIC_FAMILY == CHIP_GFX12 + // Move SCHED_MODE[1:0] from ttmp11 to unused bits in ttmp1[27:26] (return PC_HI). + // The second-level trap will restore from ttmp1 for backwards compatibility. + s_and_b32 ttmp2, ttmp11, TTMP11_SCHED_MODE_MASK + s_andn2_b32 ttmp1, ttmp1, TTMP11_SCHED_MODE_MASK + s_or_b32 ttmp1, ttmp1, ttmp2 +#endif s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS // debug trap enabled flag s_wait_idle @@ -287,6 +312,10 @@ L_EXIT_TRAP: // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it. // Only restore fields which the trap handler changes. s_lshr_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT + + // Assume relaxed scheduling mode after this point. + restore_sched_mode(ttmp2) + s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \ SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv @@ -1043,6 +1072,9 @@ L_SKIP_BARRIER_RESTORE: s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + // Assume relaxed scheduling mode after this point. + restore_sched_mode(s_restore_tmp) + s_setreg_b32 hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv // SCC is included, which is changed by previous salu // Make barrier and LDS state visible to all waves in the group. @@ -1134,3 +1166,8 @@ function valu_sgpr_hazard end #endif end + +function restore_sched_mode(s_tmp) + s_bfe_u32 s_tmp, ttmp11, (TTMP11_SCHED_MODE_SHIFT | (TTMP11_SCHED_MODE_SIZE << 0x10)) + s_setreg_b32 hwreg(HW_REG_WAVE_SCHED_MODE), s_tmp +end diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index f1e7583650c4..80c4fa2b0975 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -409,6 +409,7 @@ static u32 kfd_get_vgpr_size_per_cu(u32 gfxv) vgpr_size = 0x80000; else if (gfxv == 110000 || /* GFX_VERSION_PLUM_BONITO */ gfxv == 110001 || /* GFX_VERSION_WHEAT_NAS */ + gfxv == 110501 || /* GFX_VERSION_GFX1151 */ gfxv == 120000 || /* GFX_VERSION_GFX1200 */ gfxv == 120001) /* GFX_VERSION_GFX1201 */ vgpr_size = 0x60000; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 97c2270f278f..79ea138897fc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1144,30 +1144,48 @@ static int svm_range_split_tail(struct svm_range *prange, uint64_t new_last, struct list_head *insert_list, struct list_head *remap_list) { + unsigned long last_align_down = ALIGN_DOWN(prange->last, 512); + unsigned long start_align = ALIGN(prange->start, 512); + bool huge_page_mapping = last_align_down > start_align; struct svm_range *tail = NULL; - int r = svm_range_split(prange, prange->start, new_last, &tail); + int r; - if (!r) { - list_add(&tail->list, insert_list); - if (!IS_ALIGNED(new_last + 1, 1UL << prange->granularity)) - list_add(&tail->update_list, remap_list); - } - return r; + r = svm_range_split(prange, prange->start, new_last, &tail); + + if (r) + return r; + + list_add(&tail->list, insert_list); + + if (huge_page_mapping && tail->start > start_align && + tail->start < last_align_down && (!IS_ALIGNED(tail->start, 512))) + list_add(&tail->update_list, remap_list); + + return 0; } static int svm_range_split_head(struct svm_range *prange, uint64_t new_start, struct list_head *insert_list, struct list_head *remap_list) { + unsigned long last_align_down = ALIGN_DOWN(prange->last, 512); + unsigned long start_align = ALIGN(prange->start, 512); + bool huge_page_mapping = last_align_down > start_align; struct svm_range *head = NULL; - int r = svm_range_split(prange, new_start, prange->last, &head); + int r; - if (!r) { - list_add(&head->list, insert_list); - if (!IS_ALIGNED(new_start, 1UL << prange->granularity)) - list_add(&head->update_list, remap_list); - } - return r; + r = svm_range_split(prange, new_start, prange->last, &head); + + if (r) + return r; + + list_add(&head->list, insert_list); + + if (huge_page_mapping && head->last + 1 > start_align && + head->last + 1 < last_align_down && (!IS_ALIGNED(head->last, 512))) + list_add(&head->update_list, remap_list); + + return 0; } static void diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 811636af14ea..3eb32d58a120 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -491,6 +491,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.num_sdma_queues_per_engine); sysfs_show_32bit_prop(buffer, offs, "num_cp_queues", dev->node_props.num_cp_queues); + sysfs_show_32bit_prop(buffer, offs, "cwsr_size", + dev->node_props.cwsr_size); + sysfs_show_32bit_prop(buffer, offs, "ctl_stack_size", + dev->node_props.ctl_stack_size); if (dev->gpu) { log_max_watch_addr = |
