diff options
| -rw-r--r-- | drivers/gpu/drm/xe/Kconfig | 8 | ||||
| -rw-r--r-- | drivers/gpu/drm/xe/xe_device.c | 72 | ||||
| -rw-r--r-- | drivers/gpu/drm/xe/xe_drv.h | 2 | ||||
| -rw-r--r-- | drivers/gpu/drm/xe/xe_guc_pc.c | 284 | ||||
| -rw-r--r-- | drivers/gpu/drm/xe/xe_guc_pc.h | 2 | ||||
| -rw-r--r-- | drivers/gpu/drm/xe/xe_guc_pc_types.h | 2 | ||||
| -rw-r--r-- | drivers/gpu/drm/xe/xe_guc_submit.c | 10 | ||||
| -rw-r--r-- | drivers/gpu/drm/xe/xe_lrc.c | 37 | ||||
| -rw-r--r-- | drivers/gpu/drm/xe/xe_lrc_types.h | 3 | ||||
| -rw-r--r-- | drivers/gpu/drm/xe/xe_migrate.c | 18 | ||||
| -rw-r--r-- | drivers/gpu/drm/xe/xe_wa_oob.rules | 7 |
11 files changed, 310 insertions, 135 deletions
diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig index fcc2677a4229..99a91355842e 100644 --- a/drivers/gpu/drm/xe/Kconfig +++ b/drivers/gpu/drm/xe/Kconfig @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only config DRM_XE - tristate "Intel Xe Graphics" - depends on DRM && PCI && (m || (y && KUNIT=y)) + tristate "Intel Xe2 Graphics" + depends on DRM && PCI + depends on KUNIT || !KUNIT depends on INTEL_VSEC || !INTEL_VSEC depends on X86_PLATFORM_DEVICES || !(X86 && ACPI) select INTERVAL_TREE @@ -46,7 +47,8 @@ config DRM_XE select AUXILIARY_BUS select HMM_MIRROR help - Experimental driver for Intel Xe series GPUs + Driver for Intel Xe2 series GPUs and later. Experimental support + for Xe series is also available. If "M" is selected, the module will be called xe. diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index c02c4c4e9412..e9f3c1a53db2 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -40,6 +40,7 @@ #include "xe_gt_printk.h" #include "xe_gt_sriov_vf.h" #include "xe_guc.h" +#include "xe_guc_pc.h" #include "xe_hw_engine_group.h" #include "xe_hwmon.h" #include "xe_irq.h" @@ -986,38 +987,15 @@ void xe_device_wmb(struct xe_device *xe) xe_mmio_write32(xe_root_tile_mmio(xe), VF_CAP_REG, 0); } -/** - * xe_device_td_flush() - Flush transient L3 cache entries - * @xe: The device - * - * Display engine has direct access to memory and is never coherent with L3/L4 - * caches (or CPU caches), however KMD is responsible for specifically flushing - * transient L3 GPU cache entries prior to the flip sequence to ensure scanout - * can happen from such a surface without seeing corruption. - * - * Display surfaces can be tagged as transient by mapping it using one of the - * various L3:XD PAT index modes on Xe2. 
- * - * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed - * at the end of each submission via PIPE_CONTROL for compute/render, since SA - * Media is not coherent with L3 and we want to support render-vs-media - * usescases. For other engines like copy/blt the HW internally forces uncached - * behaviour, hence why we can skip the TDF on such platforms. +/* + * Issue a TRANSIENT_FLUSH_REQUEST and wait for completion on each gt. */ -void xe_device_td_flush(struct xe_device *xe) +static void tdf_request_sync(struct xe_device *xe) { - struct xe_gt *gt; unsigned int fw_ref; + struct xe_gt *gt; u8 id; - if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20) - return; - - if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) { - xe_device_l2_flush(xe); - return; - } - for_each_gt(gt, xe, id) { if (xe_gt_is_media_type(gt)) continue; @@ -1027,6 +1005,7 @@ void xe_device_td_flush(struct xe_device *xe) return; xe_mmio_write32(&gt->mmio, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST); + /* * FIXME: We can likely do better here with our choice of * timeout. Currently we just assume the worst case, i.e. 150us, @@ -1057,15 +1036,52 @@ void xe_device_l2_flush(struct xe_device *xe) return; spin_lock(&gt->global_invl_lock); - xe_mmio_write32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1); + xe_mmio_write32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1); if (xe_mmio_wait32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1, 0x0, 500, NULL, true)) xe_gt_err_once(gt, "Global invalidation timeout\n"); + spin_unlock(&gt->global_invl_lock); xe_force_wake_put(gt_to_fw(gt), fw_ref); } +/** + * xe_device_td_flush() - Flush transient L3 cache entries + * @xe: The device + * + * Display engine has direct access to memory and is never coherent with L3/L4 + * caches (or CPU caches), however KMD is responsible for specifically flushing + * transient L3 GPU cache entries prior to the flip sequence to ensure scanout + * can happen from such a surface without seeing corruption. 
+ * + * Display surfaces can be tagged as transient by mapping it using one of the + * various L3:XD PAT index modes on Xe2. + * + * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed + * at the end of each submission via PIPE_CONTROL for compute/render, since SA + * Media is not coherent with L3 and we want to support render-vs-media + * usescases. For other engines like copy/blt the HW internally forces uncached + * behaviour, hence why we can skip the TDF on such platforms. + */ +void xe_device_td_flush(struct xe_device *xe) +{ + struct xe_gt *root_gt; + + if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20) + return; + + root_gt = xe_root_mmio_gt(xe); + if (XE_WA(root_gt, 16023588340)) { + /* A transient flush is not sufficient: flush the L2 */ + xe_device_l2_flush(xe); + } else { + xe_guc_pc_apply_flush_freq_limit(&root_gt->uc.guc.pc); + tdf_request_sync(xe); + xe_guc_pc_remove_flush_freq_limit(&root_gt->uc.guc.pc); + } +} + u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size) { return xe_device_has_flat_ccs(xe) ? 
diff --git a/drivers/gpu/drm/xe/xe_drv.h b/drivers/gpu/drm/xe/xe_drv.h index d61650d4aa0b..95242a375e54 100644 --- a/drivers/gpu/drm/xe/xe_drv.h +++ b/drivers/gpu/drm/xe/xe_drv.h @@ -9,7 +9,7 @@ #include <drm/drm_drv.h> #define DRIVER_NAME "xe" -#define DRIVER_DESC "Intel Xe Graphics" +#define DRIVER_DESC "Intel Xe2 Graphics" /* Interface history: * diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c index 3beaaa7b25c1..c0ca61695d76 100644 --- a/drivers/gpu/drm/xe/xe_guc_pc.c +++ b/drivers/gpu/drm/xe/xe_guc_pc.c @@ -5,8 +5,11 @@ #include "xe_guc_pc.h" +#include <linux/cleanup.h> #include <linux/delay.h> +#include <linux/jiffies.h> #include <linux/ktime.h> +#include <linux/wait_bit.h> #include <drm/drm_managed.h> #include <drm/drm_print.h> @@ -51,9 +54,12 @@ #define LNL_MERT_FREQ_CAP 800 #define BMG_MERT_FREQ_CAP 2133 +#define BMG_MIN_FREQ 1200 +#define BMG_MERT_FLUSH_FREQ_CAP 2600 #define SLPC_RESET_TIMEOUT_MS 5 /* roughly 5ms, but no need for precision */ #define SLPC_RESET_EXTENDED_TIMEOUT_MS 1000 /* To be used only at pc_start */ +#define SLPC_ACT_FREQ_TIMEOUT_MS 100 /** * DOC: GuC Power Conservation (PC) @@ -141,6 +147,36 @@ static int wait_for_pc_state(struct xe_guc_pc *pc, return -ETIMEDOUT; } +static int wait_for_flush_complete(struct xe_guc_pc *pc) +{ + const unsigned long timeout = msecs_to_jiffies(30); + + if (!wait_var_event_timeout(&pc->flush_freq_limit, + !atomic_read(&pc->flush_freq_limit), + timeout)) + return -ETIMEDOUT; + + return 0; +} + +static int wait_for_act_freq_limit(struct xe_guc_pc *pc, u32 freq) +{ + int timeout_us = SLPC_ACT_FREQ_TIMEOUT_MS * USEC_PER_MSEC; + int slept, wait = 10; + + for (slept = 0; slept < timeout_us;) { + if (xe_guc_pc_get_act_freq(pc) <= freq) + return 0; + + usleep_range(wait, wait << 1); + slept += wait; + wait <<= 1; + if (slept + wait > timeout_us) + wait = timeout_us - slept; + } + + return -ETIMEDOUT; +} static int pc_action_reset(struct xe_guc_pc *pc) { struct xe_guc_ct *ct = 
pc_to_ct(pc); @@ -553,6 +589,25 @@ u32 xe_guc_pc_get_rpn_freq(struct xe_guc_pc *pc) return pc->rpn_freq; } +static int xe_guc_pc_get_min_freq_locked(struct xe_guc_pc *pc, u32 *freq) +{ + int ret; + + lockdep_assert_held(&pc->freq_lock); + + /* Might be in the middle of a gt reset */ + if (!pc->freq_ready) + return -EAGAIN; + + ret = pc_action_query_task_state(pc); + if (ret) + return ret; + + *freq = pc_get_min_freq(pc); + + return 0; +} + /** * xe_guc_pc_get_min_freq - Get the min operational frequency * @pc: The GuC PC @@ -563,26 +618,28 @@ u32 xe_guc_pc_get_rpn_freq(struct xe_guc_pc *pc) */ int xe_guc_pc_get_min_freq(struct xe_guc_pc *pc, u32 *freq) { + guard(mutex)(&pc->freq_lock); + + return xe_guc_pc_get_min_freq_locked(pc, freq); +} + +static int xe_guc_pc_set_min_freq_locked(struct xe_guc_pc *pc, u32 freq) +{ int ret; - xe_device_assert_mem_access(pc_to_xe(pc)); + lockdep_assert_held(&pc->freq_lock); - mutex_lock(&pc->freq_lock); - if (!pc->freq_ready) { - /* Might be in the middle of a gt reset */ - ret = -EAGAIN; - goto out; - } + /* Might be in the middle of a gt reset */ + if (!pc->freq_ready) + return -EAGAIN; - ret = pc_action_query_task_state(pc); + ret = pc_set_min_freq(pc, freq); if (ret) - goto out; + return ret; - *freq = pc_get_min_freq(pc); + pc->user_requested_min = freq; -out: - mutex_unlock(&pc->freq_lock); - return ret; + return 0; } /** @@ -596,24 +653,28 @@ out: */ int xe_guc_pc_set_min_freq(struct xe_guc_pc *pc, u32 freq) { + guard(mutex)(&pc->freq_lock); + + return xe_guc_pc_set_min_freq_locked(pc, freq); +} + +static int xe_guc_pc_get_max_freq_locked(struct xe_guc_pc *pc, u32 *freq) +{ int ret; - mutex_lock(&pc->freq_lock); - if (!pc->freq_ready) { - /* Might be in the middle of a gt reset */ - ret = -EAGAIN; - goto out; - } + lockdep_assert_held(&pc->freq_lock); - ret = pc_set_min_freq(pc, freq); + /* Might be in the middle of a gt reset */ + if (!pc->freq_ready) + return -EAGAIN; + + ret = pc_action_query_task_state(pc); if (ret) - 
goto out; + return ret; - pc->user_requested_min = freq; + *freq = pc_get_max_freq(pc); -out: - mutex_unlock(&pc->freq_lock); - return ret; + return 0; } /** @@ -626,24 +687,28 @@ out: */ int xe_guc_pc_get_max_freq(struct xe_guc_pc *pc, u32 *freq) { + guard(mutex)(&pc->freq_lock); + + return xe_guc_pc_get_max_freq_locked(pc, freq); +} + +static int xe_guc_pc_set_max_freq_locked(struct xe_guc_pc *pc, u32 freq) +{ int ret; - mutex_lock(&pc->freq_lock); - if (!pc->freq_ready) { - /* Might be in the middle of a gt reset */ - ret = -EAGAIN; - goto out; - } + lockdep_assert_held(&pc->freq_lock); - ret = pc_action_query_task_state(pc); + /* Might be in the middle of a gt reset */ + if (!pc->freq_ready) + return -EAGAIN; + + ret = pc_set_max_freq(pc, freq); if (ret) - goto out; + return ret; - *freq = pc_get_max_freq(pc); + pc->user_requested_max = freq; -out: - mutex_unlock(&pc->freq_lock); - return ret; + return 0; } /** @@ -657,24 +722,14 @@ out: */ int xe_guc_pc_set_max_freq(struct xe_guc_pc *pc, u32 freq) { - int ret; - - mutex_lock(&pc->freq_lock); - if (!pc->freq_ready) { - /* Might be in the middle of a gt reset */ - ret = -EAGAIN; - goto out; + if (XE_WA(pc_to_gt(pc), 22019338487)) { + if (wait_for_flush_complete(pc) != 0) + return -EAGAIN; } - ret = pc_set_max_freq(pc, freq); - if (ret) - goto out; - - pc->user_requested_max = freq; + guard(mutex)(&pc->freq_lock); -out: - mutex_unlock(&pc->freq_lock); - return ret; + return xe_guc_pc_set_max_freq_locked(pc, freq); } /** @@ -817,6 +872,7 @@ void xe_guc_pc_init_early(struct xe_guc_pc *pc) static int pc_adjust_freq_bounds(struct xe_guc_pc *pc) { + struct xe_tile *tile = gt_to_tile(pc_to_gt(pc)); int ret; lockdep_assert_held(&pc->freq_lock); @@ -843,6 +899,9 @@ static int pc_adjust_freq_bounds(struct xe_guc_pc *pc) if (pc_get_min_freq(pc) > pc->rp0_freq) ret = pc_set_min_freq(pc, pc->rp0_freq); + if (XE_WA(tile->primary_gt, 14022085890)) + ret = pc_set_min_freq(pc, max(BMG_MIN_FREQ, pc_get_min_freq(pc))); + out: 
return ret; } @@ -868,30 +927,117 @@ static int pc_adjust_requested_freq(struct xe_guc_pc *pc) return ret; } -static int pc_set_mert_freq_cap(struct xe_guc_pc *pc) +static bool needs_flush_freq_limit(struct xe_guc_pc *pc) { - int ret = 0; + struct xe_gt *gt = pc_to_gt(pc); - if (XE_WA(pc_to_gt(pc), 22019338487)) { - /* - * Get updated min/max and stash them. - */ - ret = xe_guc_pc_get_min_freq(pc, &pc->stashed_min_freq); - if (!ret) - ret = xe_guc_pc_get_max_freq(pc, &pc->stashed_max_freq); - if (ret) - return ret; + return XE_WA(gt, 22019338487) && + pc->rp0_freq > BMG_MERT_FLUSH_FREQ_CAP; +} + +/** + * xe_guc_pc_apply_flush_freq_limit() - Limit max GT freq during L2 flush + * @pc: the xe_guc_pc object + * + * As per the WA, reduce max GT frequency during L2 cache flush + */ +void xe_guc_pc_apply_flush_freq_limit(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + u32 max_freq; + int ret; + + if (!needs_flush_freq_limit(pc)) + return; + + guard(mutex)(&pc->freq_lock); + + ret = xe_guc_pc_get_max_freq_locked(pc, &max_freq); + if (!ret && max_freq > BMG_MERT_FLUSH_FREQ_CAP) { + ret = pc_set_max_freq(pc, BMG_MERT_FLUSH_FREQ_CAP); + if (ret) { + xe_gt_err_once(gt, "Failed to cap max freq on flush to %u, %pe\n", + BMG_MERT_FLUSH_FREQ_CAP, ERR_PTR(ret)); + return; + } + + atomic_set(&pc->flush_freq_limit, 1); /* - * Ensure min and max are bound by MERT_FREQ_CAP until driver loads. + * If user has previously changed max freq, stash that value to + * restore later, otherwise use the current max. New user + * requests wait on flush. 
*/ - mutex_lock(&pc->freq_lock); - ret = pc_set_min_freq(pc, min(pc->rpe_freq, pc_max_freq_cap(pc))); - if (!ret) - ret = pc_set_max_freq(pc, min(pc->rp0_freq, pc_max_freq_cap(pc))); - mutex_unlock(&pc->freq_lock); + if (pc->user_requested_max != 0) + pc->stashed_max_freq = pc->user_requested_max; + else + pc->stashed_max_freq = max_freq; } + /* + * Wait for actual freq to go below the flush cap: even if the previous + * max was below cap, the current one might still be above it + */ + ret = wait_for_act_freq_limit(pc, BMG_MERT_FLUSH_FREQ_CAP); + if (ret) + xe_gt_err_once(gt, "Actual freq did not reduce to %u, %pe\n", + BMG_MERT_FLUSH_FREQ_CAP, ERR_PTR(ret)); +} + +/** + * xe_guc_pc_remove_flush_freq_limit() - Remove max GT freq limit after L2 flush completes. + * @pc: the xe_guc_pc object + * + * Retrieve the previous GT max frequency value. + */ +void xe_guc_pc_remove_flush_freq_limit(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + int ret = 0; + + if (!needs_flush_freq_limit(pc)) + return; + + if (!atomic_read(&pc->flush_freq_limit)) + return; + + mutex_lock(&pc->freq_lock); + + ret = pc_set_max_freq(&gt->uc.guc.pc, pc->stashed_max_freq); + if (ret) + xe_gt_err_once(gt, "Failed to restore max freq %u:%d", + pc->stashed_max_freq, ret); + + atomic_set(&pc->flush_freq_limit, 0); + mutex_unlock(&pc->freq_lock); + wake_up_var(&pc->flush_freq_limit); +} + +static int pc_set_mert_freq_cap(struct xe_guc_pc *pc) +{ + int ret; + + if (!XE_WA(pc_to_gt(pc), 22019338487)) + return 0; + + guard(mutex)(&pc->freq_lock); + + /* + * Get updated min/max and stash them. + */ + ret = xe_guc_pc_get_min_freq_locked(pc, &pc->stashed_min_freq); + if (!ret) + ret = xe_guc_pc_get_max_freq_locked(pc, &pc->stashed_max_freq); + if (ret) + return ret; + + /* + * Ensure min and max are bound by MERT_FREQ_CAP until driver loads. 
+ */ + ret = pc_set_min_freq(pc, min(pc->rpe_freq, pc_max_freq_cap(pc))); + if (!ret) + ret = pc_set_max_freq(pc, min(pc->rp0_freq, pc_max_freq_cap(pc))); + return ret; } diff --git a/drivers/gpu/drm/xe/xe_guc_pc.h b/drivers/gpu/drm/xe/xe_guc_pc.h index 0a2664d5c811..52ecdd5ddbff 100644 --- a/drivers/gpu/drm/xe/xe_guc_pc.h +++ b/drivers/gpu/drm/xe/xe_guc_pc.h @@ -38,5 +38,7 @@ u64 xe_guc_pc_mc6_residency(struct xe_guc_pc *pc); void xe_guc_pc_init_early(struct xe_guc_pc *pc); int xe_guc_pc_restore_stashed_freq(struct xe_guc_pc *pc); void xe_guc_pc_raise_unslice(struct xe_guc_pc *pc); +void xe_guc_pc_apply_flush_freq_limit(struct xe_guc_pc *pc); +void xe_guc_pc_remove_flush_freq_limit(struct xe_guc_pc *pc); #endif /* _XE_GUC_PC_H_ */ diff --git a/drivers/gpu/drm/xe/xe_guc_pc_types.h b/drivers/gpu/drm/xe/xe_guc_pc_types.h index 2978ac9a249b..c02053948a57 100644 --- a/drivers/gpu/drm/xe/xe_guc_pc_types.h +++ b/drivers/gpu/drm/xe/xe_guc_pc_types.h @@ -15,6 +15,8 @@ struct xe_guc_pc { /** @bo: GGTT buffer object that is shared with GuC PC */ struct xe_bo *bo; + /** @flush_freq_limit: 1 when max freq changes are limited by driver */ + atomic_t flush_freq_limit; /** @rp0_freq: HW RP0 frequency - The Maximum one */ u32 rp0_freq; /** @rpa_freq: HW RPa frequency - The Achievable one */ diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 9567f6700cf2..2ac87ff4a057 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -891,12 +891,13 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w) struct xe_exec_queue *q = ge->q; struct xe_guc *guc = exec_queue_to_guc(q); struct xe_gpu_scheduler *sched = &ge->sched; - bool wedged; + bool wedged = false; xe_gt_assert(guc_to_gt(guc), xe_exec_queue_is_lr(q)); trace_xe_exec_queue_lr_cleanup(q); - wedged = guc_submit_hint_wedged(exec_queue_to_guc(q)); + if (!exec_queue_killed(q)) + wedged = guc_submit_hint_wedged(exec_queue_to_guc(q)); /* Kill the run_job 
/ process_msg entry points */ xe_sched_submission_stop(sched); @@ -1070,7 +1071,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) int err = -ETIME; pid_t pid = -1; int i = 0; - bool wedged, skip_timeout_check; + bool wedged = false, skip_timeout_check; /* * TDR has fired before free job worker. Common if exec queue @@ -1116,7 +1117,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) * doesn't work for SRIOV. For now assuming timeouts in wedged mode are * genuine timeouts. */ - wedged = guc_submit_hint_wedged(exec_queue_to_guc(q)); + if (!exec_queue_killed(q)) + wedged = guc_submit_hint_wedged(exec_queue_to_guc(q)); /* Engine state now stable, disable scheduling to check timestamp */ if (!wedged && exec_queue_registered(q)) { diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index bf7c3981897d..6e7b70532d11 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -40,6 +40,7 @@ #define LRC_PPHWSP_SIZE SZ_4K #define LRC_INDIRECT_RING_STATE_SIZE SZ_4K +#define LRC_WA_BB_SIZE SZ_4K static struct xe_device * lrc_to_xe(struct xe_lrc *lrc) @@ -910,7 +911,11 @@ static void xe_lrc_finish(struct xe_lrc *lrc) { xe_hw_fence_ctx_finish(&lrc->fence_ctx); xe_bo_unpin_map_no_vm(lrc->bo); - xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo); +} + +static size_t wa_bb_offset(struct xe_lrc *lrc) +{ + return lrc->bo->size - LRC_WA_BB_SIZE; } /* @@ -943,15 +948,16 @@ static void xe_lrc_finish(struct xe_lrc *lrc) #define CONTEXT_ACTIVE 1ULL static int xe_lrc_setup_utilization(struct xe_lrc *lrc) { + const size_t max_size = LRC_WA_BB_SIZE; u32 *cmd, *buf = NULL; - if (lrc->bb_per_ctx_bo->vmap.is_iomem) { - buf = kmalloc(lrc->bb_per_ctx_bo->size, GFP_KERNEL); + if (lrc->bo->vmap.is_iomem) { + buf = kmalloc(max_size, GFP_KERNEL); if (!buf) return -ENOMEM; cmd = buf; } else { - cmd = lrc->bb_per_ctx_bo->vmap.vaddr; + cmd = lrc->bo->vmap.vaddr + wa_bb_offset(lrc); } *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | 
MI_SRM_ADD_CS_OFFSET; @@ -974,13 +980,14 @@ static int xe_lrc_setup_utilization(struct xe_lrc *lrc) *cmd++ = MI_BATCH_BUFFER_END; if (buf) { - xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bb_per_ctx_bo->vmap, 0, - buf, (cmd - buf) * sizeof(*cmd)); + xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bo->vmap, + wa_bb_offset(lrc), buf, + (cmd - buf) * sizeof(*cmd)); kfree(buf); } - xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR, - xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1); + xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR, xe_bo_ggtt_addr(lrc->bo) + + wa_bb_offset(lrc) + 1); return 0; } @@ -1018,20 +1025,13 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address * via VM bind calls. */ - lrc->bo = xe_bo_create_pin_map(xe, tile, NULL, lrc_size, + lrc->bo = xe_bo_create_pin_map(xe, tile, NULL, + lrc_size + LRC_WA_BB_SIZE, ttm_bo_type_kernel, bo_flags); if (IS_ERR(lrc->bo)) return PTR_ERR(lrc->bo); - lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K, - ttm_bo_type_kernel, - bo_flags); - if (IS_ERR(lrc->bb_per_ctx_bo)) { - err = PTR_ERR(lrc->bb_per_ctx_bo); - goto err_lrc_finish; - } - lrc->size = lrc_size; lrc->ring.size = ring_size; lrc->ring.tail = 0; @@ -1819,7 +1819,8 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc) snapshot->seqno = xe_lrc_seqno(lrc); snapshot->lrc_bo = xe_bo_get(lrc->bo); snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc); - snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset; + snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset - + LRC_WA_BB_SIZE; snapshot->lrc_snapshot = NULL; snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc)); snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc); diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h index ae24cf6f8dd9..883e550a9423 100644 --- a/drivers/gpu/drm/xe/xe_lrc_types.h +++ b/drivers/gpu/drm/xe/xe_lrc_types.h @@ -53,9 +53,6 @@ struct 
xe_lrc { /** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update */ u64 ctx_timestamp; - - /** @bb_per_ctx_bo: buffer object for per context batch wa buffer */ - struct xe_bo *bb_per_ctx_bo; }; struct xe_lrc_snapshot; diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 8f8e9fdfb2a8..7acdc4c78866 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -82,7 +82,7 @@ struct xe_migrate { * of the instruction. Subtracting the instruction header (1 dword) and * address (2 dwords), that leaves 0x3FD dwords (0x1FE qwords) for PTE values. */ -#define MAX_PTE_PER_SDI 0x1FE +#define MAX_PTE_PER_SDI 0x1FEU /** * xe_tile_migrate_exec_queue() - Get this tile's migrate exec queue. @@ -1553,15 +1553,17 @@ static u32 pte_update_cmd_size(u64 size) u64 entries = DIV_U64_ROUND_UP(size, XE_PAGE_SIZE); XE_WARN_ON(size > MAX_PREEMPTDISABLE_TRANSFER); + /* * MI_STORE_DATA_IMM command is used to update page table. Each - * instruction can update maximumly 0x1ff pte entries. To update - * n (n <= 0x1ff) pte entries, we need: - * 1 dword for the MI_STORE_DATA_IMM command header (opcode etc) - * 2 dword for the page table's physical location - * 2*n dword for value of pte to fill (each pte entry is 2 dwords) + * instruction can update maximumly MAX_PTE_PER_SDI pte entries. 
To + * update n (n <= MAX_PTE_PER_SDI) pte entries, we need: + * + * - 1 dword for the MI_STORE_DATA_IMM command header (opcode etc) + * - 2 dword for the page table's physical location + * - 2*n dword for value of pte to fill (each pte entry is 2 dwords) */ - num_dword = (1 + 2) * DIV_U64_ROUND_UP(entries, 0x1ff); + num_dword = (1 + 2) * DIV_U64_ROUND_UP(entries, MAX_PTE_PER_SDI); num_dword += entries * 2; return num_dword; @@ -1577,7 +1579,7 @@ static void build_pt_update_batch_sram(struct xe_migrate *m, ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE); while (ptes) { - u32 chunk = min(0x1ffU, ptes); + u32 chunk = min(MAX_PTE_PER_SDI, ptes); bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); bb->cs[bb->len++] = pt_offset; diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules index 9efc5accd43d..69c1d7fc695e 100644 --- a/drivers/gpu/drm/xe/xe_wa_oob.rules +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules @@ -21,7 +21,8 @@ GRAPHICS_VERSION_RANGE(1270, 1274) MEDIA_VERSION(1300) PLATFORM(DG2) -14018094691 GRAPHICS_VERSION(2004) +14018094691 GRAPHICS_VERSION_RANGE(2001, 2002) + GRAPHICS_VERSION(2004) 14019882105 GRAPHICS_VERSION(2004), GRAPHICS_STEP(A0, B0) 18024947630 GRAPHICS_VERSION(2001) GRAPHICS_VERSION(2004) @@ -59,3 +60,7 @@ no_media_l3 MEDIA_VERSION(3000) MEDIA_VERSION_RANGE(1301, 3000) 16026508708 GRAPHICS_VERSION_RANGE(1200, 3001) MEDIA_VERSION_RANGE(1300, 3000) + +# SoC workaround - currently applies to all platforms with the following +# primary GT GMDID +14022085890 GRAPHICS_VERSION(2001) |
