summaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs/habanalabs.h
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/habanalabs/habanalabs.h')
-rw-r--r--drivers/misc/habanalabs/habanalabs.h220
1 files changed, 162 insertions, 58 deletions
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index a8ee52c880cd..71243b319920 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -11,8 +11,6 @@
#include "include/armcp_if.h"
#include "include/qman_if.h"
-#define pr_fmt(fmt) "habanalabs: " fmt
-
#include <linux/cdev.h>
#include <linux/iopoll.h>
#include <linux/irqreturn.h>
@@ -33,6 +31,9 @@
#define HL_PLL_LOW_JOB_FREQ_USEC 5000000 /* 5 s */
+#define HL_ARMCP_INFO_TIMEOUT_USEC 10000000 /* 10s */
+#define HL_ARMCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */
+
#define HL_MAX_QUEUES 128
#define HL_MAX_JOBS_PER_CS 64
@@ -48,8 +49,9 @@
/**
* struct pgt_info - MMU hop page info.
- * @node: hash linked-list node for the pgts hash of pgts.
- * @addr: physical address of the pgt.
+ * @node: hash linked-list node for the pgts shadow hash of pgts.
+ * @phys_addr: physical address of the pgt.
+ * @shadow_addr: shadow hop in the host.
* @ctx: pointer to the owner ctx.
* @num_of_ptes: indicates how many ptes are used in the pgt.
*
@@ -59,10 +61,11 @@
* page, it is freed with its pgt_info structure.
*/
struct pgt_info {
- struct hlist_node node;
- u64 addr;
- struct hl_ctx *ctx;
- int num_of_ptes;
+ struct hlist_node node;
+ u64 phys_addr;
+ u64 shadow_addr;
+ struct hl_ctx *ctx;
+ int num_of_ptes;
};
struct hl_device;
@@ -132,8 +135,6 @@ enum hl_device_hw_state {
* @dram_user_base_address: DRAM physical start address for user access.
* @dram_size: DRAM total size.
* @dram_pci_bar_size: size of PCI bar towards DRAM.
- * @host_phys_base_address: base physical address of host memory for
- * transactions that the device generates.
* @max_power_default: max power of the device after reset
* @va_space_host_start_address: base address of virtual memory range for
* mapping host memory.
@@ -145,6 +146,8 @@ enum hl_device_hw_state {
* mapping DRAM memory.
* @dram_size_for_default_page_mapping: DRAM size needed to map to avoid page
* fault.
+ * @pcie_dbi_base_address: Base address of the PCIE_DBI block.
+ * @pcie_aux_dbi_reg_addr: Address of the PCIE_AUX DBI register.
* @mmu_pgt_addr: base physical address in DRAM of MMU page tables.
* @mmu_dram_default_page_addr: DRAM default page physical address.
* @mmu_pgt_size: MMU page tables total size.
@@ -179,13 +182,14 @@ struct asic_fixed_properties {
u64 dram_user_base_address;
u64 dram_size;
u64 dram_pci_bar_size;
- u64 host_phys_base_address;
u64 max_power_default;
u64 va_space_host_start_address;
u64 va_space_host_end_address;
u64 va_space_dram_start_address;
u64 va_space_dram_end_address;
u64 dram_size_for_default_page_mapping;
+ u64 pcie_dbi_base_address;
+ u64 pcie_aux_dbi_reg_addr;
u64 mmu_pgt_addr;
u64 mmu_dram_default_page_addr;
u32 mmu_pgt_size;
@@ -314,6 +318,18 @@ struct hl_cs_job;
#define HL_EQ_LENGTH 64
#define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE)
+#define HL_CPU_PKT_SHIFT 5
+#define HL_CPU_PKT_SIZE (1 << HL_CPU_PKT_SHIFT)
+#define HL_CPU_PKT_MASK (~((1 << HL_CPU_PKT_SHIFT) - 1))
+#define HL_CPU_MAX_PKTS_IN_CB 32
+#define HL_CPU_CB_SIZE (HL_CPU_PKT_SIZE * \
+ HL_CPU_MAX_PKTS_IN_CB)
+#define HL_CPU_CB_QUEUE_SIZE (HL_QUEUE_LENGTH * HL_CPU_CB_SIZE)
+
+/* KMD <-> ArmCP shared memory size (EQ + PQ + CPU CB queue) */
+#define HL_CPU_ACCESSIBLE_MEM_SIZE (HL_EQ_SIZE_IN_BYTES + \
+ HL_QUEUE_SIZE_IN_BYTES + \
+ HL_CPU_CB_QUEUE_SIZE)
/**
* struct hl_hw_queue - describes a H/W transport queue.
@@ -381,14 +397,12 @@ struct hl_eq {
/**
* enum hl_asic_type - supported ASIC types.
- * @ASIC_AUTO_DETECT: ASIC type will be automatically set.
- * @ASIC_GOYA: Goya device.
* @ASIC_INVALID: Invalid ASIC type.
+ * @ASIC_GOYA: Goya device.
*/
enum hl_asic_type {
- ASIC_AUTO_DETECT,
- ASIC_GOYA,
- ASIC_INVALID
+ ASIC_INVALID,
+ ASIC_GOYA
};
struct hl_cs_parser;
@@ -436,19 +450,19 @@ enum hl_pll_frequency {
* @cb_mmap: maps a CB.
* @ring_doorbell: increment PI on a given QMAN.
* @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed.
- * @dma_alloc_coherent: Allocate coherent DMA memory by calling
- * dma_alloc_coherent(). This is ASIC function because its
- * implementation is not trivial when the driver is loaded
- * in simulation mode (not upstreamed).
- * @dma_free_coherent: Free coherent DMA memory by calling dma_free_coherent().
- * This is ASIC function because its implementation is not
- * trivial when the driver is loaded in simulation mode
- * (not upstreamed).
+ * @asic_dma_alloc_coherent: Allocate coherent DMA memory by calling
+ * dma_alloc_coherent(). This is ASIC function because
+ * its implementation is not trivial when the driver
+ * is loaded in simulation mode (not upstreamed).
+ * @asic_dma_free_coherent: Free coherent DMA memory by calling
+ * dma_free_coherent(). This is ASIC function because
+ * its implementation is not trivial when the driver
+ * is loaded in simulation mode (not upstreamed).
* @get_int_queue_base: get the internal queue base address.
* @test_queues: run simple test on all queues for sanity check.
- * @dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
- * size of allocation is HL_DMA_POOL_BLK_SIZE.
- * @dma_pool_free: free small DMA allocation from pool.
+ * @asic_dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
+ * size of allocation is HL_DMA_POOL_BLK_SIZE.
+ * @asic_dma_pool_free: free small DMA allocation from pool.
* @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool.
* @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool.
* @hl_dma_unmap_sg: DMA unmap scatter-gather list.
@@ -472,8 +486,7 @@ enum hl_pll_frequency {
* @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with
* ASID-VA-size mask.
* @send_heartbeat: send is-alive packet to ArmCP and verify response.
- * @enable_clock_gating: enable clock gating for reducing power consumption.
- * @disable_clock_gating: disable clock for accessing registers on HBW.
+ * @debug_coresight: perform certain actions on Coresight for debugging.
* @is_device_idle: return true if device is idle, false otherwise.
* @soft_reset_late_init: perform certain actions needed after soft reset.
* @hw_queues_lock: acquire H/W queues lock.
@@ -482,6 +495,12 @@ enum hl_pll_frequency {
* @get_eeprom_data: retrieve EEPROM data from F/W.
* @send_cpu_message: send buffer to ArmCP.
* @get_hw_state: retrieve the H/W state
+ * @pci_bars_map: Map PCI BARs.
+ * @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns
+ * old address the bar pointed to or U64_MAX for failure
+ * @init_iatu: Initialize the iATU unit inside the PCI controller.
+ * @rreg: Read a register. Needed for simulator support.
+ * @wreg: Write a register. Needed for simulator support.
*/
struct hl_asic_funcs {
int (*early_init)(struct hl_device *hdev);
@@ -499,27 +518,27 @@ struct hl_asic_funcs {
u64 kaddress, phys_addr_t paddress, u32 size);
void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val);
- void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size,
+ void* (*asic_dma_alloc_coherent)(struct hl_device *hdev, size_t size,
dma_addr_t *dma_handle, gfp_t flag);
- void (*dma_free_coherent)(struct hl_device *hdev, size_t size,
+ void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size,
void *cpu_addr, dma_addr_t dma_handle);
void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id,
dma_addr_t *dma_handle, u16 *queue_len);
int (*test_queues)(struct hl_device *hdev);
- void* (*dma_pool_zalloc)(struct hl_device *hdev, size_t size,
+ void* (*asic_dma_pool_zalloc)(struct hl_device *hdev, size_t size,
gfp_t mem_flags, dma_addr_t *dma_handle);
- void (*dma_pool_free)(struct hl_device *hdev, void *vaddr,
+ void (*asic_dma_pool_free)(struct hl_device *hdev, void *vaddr,
dma_addr_t dma_addr);
void* (*cpu_accessible_dma_pool_alloc)(struct hl_device *hdev,
size_t size, dma_addr_t *dma_handle);
void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
size_t size, void *vaddr);
void (*hl_dma_unmap_sg)(struct hl_device *hdev,
- struct scatterlist *sg, int nents,
+ struct scatterlist *sgl, int nents,
enum dma_data_direction dir);
int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser);
int (*asic_dma_map_sg)(struct hl_device *hdev,
- struct scatterlist *sg, int nents,
+ struct scatterlist *sgl, int nents,
enum dma_data_direction dir);
u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
struct sg_table *sgt);
@@ -543,9 +562,8 @@ struct hl_asic_funcs {
void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
u32 asid, u64 va, u64 size);
int (*send_heartbeat)(struct hl_device *hdev);
- void (*enable_clock_gating)(struct hl_device *hdev);
- void (*disable_clock_gating)(struct hl_device *hdev);
- bool (*is_device_idle)(struct hl_device *hdev);
+ int (*debug_coresight)(struct hl_device *hdev, void *data);
+ bool (*is_device_idle)(struct hl_device *hdev, char *buf, size_t size);
int (*soft_reset_late_init)(struct hl_device *hdev);
void (*hw_queues_lock)(struct hl_device *hdev);
void (*hw_queues_unlock)(struct hl_device *hdev);
@@ -555,6 +573,11 @@ struct hl_asic_funcs {
int (*send_cpu_message)(struct hl_device *hdev, u32 *msg,
u16 len, u32 timeout, long *result);
enum hl_device_hw_state (*get_hw_state)(struct hl_device *hdev);
+ int (*pci_bars_map)(struct hl_device *hdev);
+ u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr);
+ int (*init_iatu)(struct hl_device *hdev);
+ u32 (*rreg)(struct hl_device *hdev, u32 reg);
+ void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
};
@@ -582,7 +605,8 @@ struct hl_va_range {
* struct hl_ctx - user/kernel context.
* @mem_hash: holds mapping from virtual address to virtual memory area
* descriptor (hl_vm_phys_pg_list or hl_userptr).
- * @mmu_hash: holds a mapping from virtual address to pgt_info structure.
+ * @mmu_phys_hash: holds a mapping from physical address to pgt_info structure.
+ * @mmu_shadow_hash: holds a mapping from shadow address to pgt_info structure.
* @hpriv: pointer to the private (KMD) data of the process (fd).
* @hdev: pointer to the device structure.
* @refcount: reference counter for the context. Context is released only when
@@ -601,17 +625,19 @@ struct hl_va_range {
* DRAM mapping.
* @cs_lock: spinlock to protect cs_sequence.
* @dram_phys_mem: amount of used physical DRAM memory by this context.
- * @thread_restore_token: token to prevent multiple threads of the same context
- * from running the restore phase. Only one thread
- * should run it.
- * @thread_restore_wait_token: token to prevent the threads that didn't run
- * the restore phase from moving to their execution
- * phase before the restore phase has finished.
+ * @thread_ctx_switch_token: token to prevent multiple threads of the same
+ * context from running the context switch phase.
+ * Only a single thread should run it.
+ * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run
+ * the context switch phase from moving to their
+ * execution phase before the context switch phase
+ * has finished.
* @asid: context's unique address space ID in the device's MMU.
*/
struct hl_ctx {
DECLARE_HASHTABLE(mem_hash, MEM_HASH_TABLE_BITS);
- DECLARE_HASHTABLE(mmu_hash, MMU_HASH_TABLE_BITS);
+ DECLARE_HASHTABLE(mmu_phys_hash, MMU_HASH_TABLE_BITS);
+ DECLARE_HASHTABLE(mmu_shadow_hash, MMU_HASH_TABLE_BITS);
struct hl_fpriv *hpriv;
struct hl_device *hdev;
struct kref refcount;
@@ -625,8 +651,8 @@ struct hl_ctx {
u64 *dram_default_hops;
spinlock_t cs_lock;
atomic64_t dram_phys_mem;
- atomic_t thread_restore_token;
- u32 thread_restore_wait_token;
+ atomic_t thread_ctx_switch_token;
+ u32 thread_ctx_switch_wait_token;
u32 asid;
};
@@ -753,8 +779,6 @@ struct hl_cs_job {
* @patched_cb_size: the size of the CB after parsing.
* @ext_queue: whether the job is for external queue or internal queue.
* @job_id: the id of the related job inside the related CS.
- * @use_virt_addr: whether to treat the addresses in the CB as virtual during
- * parsing.
*/
struct hl_cs_parser {
struct hl_cb *user_cb;
@@ -767,7 +791,6 @@ struct hl_cs_parser {
u32 patched_cb_size;
u8 ext_queue;
u8 job_id;
- u8 use_virt_addr;
};
@@ -850,6 +873,29 @@ struct hl_vm {
u8 init_done;
};
+
+/*
+ * DEBUG, PROFILING STRUCTURE
+ */
+
+/**
+ * struct hl_debug_params - Coresight debug parameters.
+ * @input: pointer to component specific input parameters.
+ * @output: pointer to component specific output parameters.
+ * @output_size: size of output buffer.
+ * @reg_idx: relevant register ID.
+ * @op: component operation to execute.
+ * @enable: true if to enable component debugging, false otherwise.
+ */
+struct hl_debug_params {
+ void *input;
+ void *output;
+ u32 output_size;
+ u32 reg_idx;
+ u32 op;
+ bool enable;
+};
+
/*
* FILE PRIVATE STRUCTURE
*/
@@ -973,13 +1019,10 @@ struct hl_dbg_device_entry {
u32 hl_rreg(struct hl_device *hdev, u32 reg);
void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
-#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \
- readl_poll_timeout(hdev->rmmio + addr, val, cond, sleep_us, timeout_us)
-
-#define RREG32(reg) hl_rreg(hdev, (reg))
-#define WREG32(reg, v) hl_wreg(hdev, (reg), (v))
+#define RREG32(reg) hdev->asic_funcs->rreg(hdev, (reg))
+#define WREG32(reg, v) hdev->asic_funcs->wreg(hdev, (reg), (v))
#define DREG32(reg) pr_info("REGISTER: " #reg " : 0x%08X\n", \
- hl_rreg(hdev, (reg)))
+ hdev->asic_funcs->rreg(hdev, (reg)))
#define WREG32_P(reg, val, mask) \
do { \
@@ -997,6 +1040,36 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
WREG32(mm##reg, (RREG32(mm##reg) & ~REG_FIELD_MASK(reg, field)) | \
(val) << REG_FIELD_SHIFT(reg, field))
+#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \
+({ \
+ ktime_t __timeout; \
+ /* timeout should be longer when working with simulator */ \
+ if (hdev->pdev) \
+ __timeout = ktime_add_us(ktime_get(), timeout_us); \
+ else \
+ __timeout = ktime_add_us(ktime_get(), (timeout_us * 10)); \
+ might_sleep_if(sleep_us); \
+ for (;;) { \
+ (val) = RREG32(addr); \
+ if (cond) \
+ break; \
+ if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \
+ (val) = RREG32(addr); \
+ break; \
+ } \
+ if (sleep_us) \
+ usleep_range((sleep_us >> 2) + 1, sleep_us); \
+ } \
+ (cond) ? 0 : -ETIMEDOUT; \
+})
+
+
+#define HL_ENG_BUSY(buf, size, fmt, ...) ({ \
+ if (buf) \
+ snprintf(buf, size, fmt, ##__VA_ARGS__); \
+ false; \
+ })
+
struct hwmon_chip_info;
/**
@@ -1047,7 +1120,8 @@ struct hl_device_reset_work {
* @asic_specific: ASIC specific information to use only from ASIC files.
* @mmu_pgt_pool: pool of available MMU hops.
* @vm: virtual memory manager for MMU.
- * @mmu_cache_lock: protects MMU cache invalidation as it can serve one context
+ * @mmu_cache_lock: protects MMU cache invalidation as it can serve one context.
+ * @mmu_shadow_hop0: shadow mapping of the MMU hop 0 zone.
* @hwmon_dev: H/W monitor device.
* @pm_mng_profile: current power management profile.
* @hl_chip_info: ASIC's sensors information.
@@ -1082,6 +1156,7 @@ struct hl_device_reset_work {
* @init_done: is the initialization of the device done.
* @mmu_enable: is MMU enabled.
* @device_cpu_disabled: is the device CPU disabled (due to timeouts)
+ * @dma_mask: the dma mask that was set for this device
*/
struct hl_device {
struct pci_dev *pdev;
@@ -1117,6 +1192,7 @@ struct hl_device {
struct gen_pool *mmu_pgt_pool;
struct hl_vm vm;
struct mutex mmu_cache_lock;
+ void *mmu_shadow_hop0;
struct device *hwmon_dev;
enum hl_pm_mng_profile pm_mng_profile;
struct hwmon_chip_info *hl_chip_info;
@@ -1151,6 +1227,7 @@ struct hl_device {
u8 dram_default_page_mapping;
u8 init_done;
u8 device_cpu_disabled;
+ u8 dma_mask;
/* Parameters for bring-up */
u8 mmu_enable;
@@ -1245,6 +1322,7 @@ static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
int hl_device_open(struct inode *inode, struct file *filp);
bool hl_device_disabled_or_in_reset(struct hl_device *hdev);
+enum hl_device_status hl_device_status(struct hl_device *hdev);
int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
enum hl_asic_type asic_type, int minor);
void destroy_hdev(struct hl_device *hdev);
@@ -1351,6 +1429,32 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size);
void hl_mmu_swap_out(struct hl_ctx *ctx);
void hl_mmu_swap_in(struct hl_ctx *ctx);
+int hl_fw_push_fw_to_device(struct hl_device *hdev, const char *fw_name,
+ void __iomem *dst);
+int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode);
+int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
+ u16 len, u32 timeout, long *result);
+int hl_fw_test_cpu_queue(struct hl_device *hdev);
+void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
+ dma_addr_t *dma_handle);
+void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
+ void *vaddr);
+int hl_fw_send_heartbeat(struct hl_device *hdev);
+int hl_fw_armcp_info_get(struct hl_device *hdev);
+int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
+
+int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
+ bool is_wc[3]);
+int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data);
+int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
+ u64 addr);
+int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
+ u64 dram_base_address, u64 host_phys_base_address,
+ u64 host_phys_size);
+int hl_pci_init(struct hl_device *hdev, u8 dma_mask);
+void hl_pci_fini(struct hl_device *hdev);
+int hl_pci_set_dma_mask(struct hl_device *hdev, u8 dma_mask);
+
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq);
long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr);