diff options
Diffstat (limited to 'drivers/misc/habanalabs/habanalabs.h')
-rw-r--r-- | drivers/misc/habanalabs/habanalabs.h | 220 |
1 files changed, 162 insertions, 58 deletions
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index a8ee52c880cd..71243b319920 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -11,8 +11,6 @@ #include "include/armcp_if.h" #include "include/qman_if.h" -#define pr_fmt(fmt) "habanalabs: " fmt - #include <linux/cdev.h> #include <linux/iopoll.h> #include <linux/irqreturn.h> @@ -33,6 +31,9 @@ #define HL_PLL_LOW_JOB_FREQ_USEC 5000000 /* 5 s */ +#define HL_ARMCP_INFO_TIMEOUT_USEC 10000000 /* 10s */ +#define HL_ARMCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */ + #define HL_MAX_QUEUES 128 #define HL_MAX_JOBS_PER_CS 64 @@ -48,8 +49,9 @@ /** * struct pgt_info - MMU hop page info. - * @node: hash linked-list node for the pgts hash of pgts. - * @addr: physical address of the pgt. + * @node: hash linked-list node for the pgts shadow hash of pgts. + * @phys_addr: physical address of the pgt. + * @shadow_addr: shadow hop in the host. * @ctx: pointer to the owner ctx. * @num_of_ptes: indicates how many ptes are used in the pgt. * @@ -59,10 +61,11 @@ * page, it is freed with its pgt_info structure. */ struct pgt_info { - struct hlist_node node; - u64 addr; - struct hl_ctx *ctx; - int num_of_ptes; + struct hlist_node node; + u64 phys_addr; + u64 shadow_addr; + struct hl_ctx *ctx; + int num_of_ptes; }; struct hl_device; @@ -132,8 +135,6 @@ enum hl_device_hw_state { * @dram_user_base_address: DRAM physical start address for user access. * @dram_size: DRAM total size. * @dram_pci_bar_size: size of PCI bar towards DRAM. - * @host_phys_base_address: base physical address of host memory for - * transactions that the device generates. * @max_power_default: max power of the device after reset * @va_space_host_start_address: base address of virtual memory range for * mapping host memory. @@ -145,6 +146,8 @@ enum hl_device_hw_state { * mapping DRAM memory. * @dram_size_for_default_page_mapping: DRAM size needed to map to avoid page * fault. + * @pcie_dbi_base_address: Base address of the PCIE_DBI block. + * @pcie_aux_dbi_reg_addr: Address of the PCIE_AUX DBI register. * @mmu_pgt_addr: base physical address in DRAM of MMU page tables. * @mmu_dram_default_page_addr: DRAM default page physical address. * @mmu_pgt_size: MMU page tables total size. @@ -179,13 +182,14 @@ struct asic_fixed_properties { u64 dram_user_base_address; u64 dram_size; u64 dram_pci_bar_size; - u64 host_phys_base_address; u64 max_power_default; u64 va_space_host_start_address; u64 va_space_host_end_address; u64 va_space_dram_start_address; u64 va_space_dram_end_address; u64 dram_size_for_default_page_mapping; + u64 pcie_dbi_base_address; + u64 pcie_aux_dbi_reg_addr; u64 mmu_pgt_addr; u64 mmu_dram_default_page_addr; u32 mmu_pgt_size; @@ -314,6 +318,18 @@ struct hl_cs_job; #define HL_EQ_LENGTH 64 #define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE) +#define HL_CPU_PKT_SHIFT 5 +#define HL_CPU_PKT_SIZE (1 << HL_CPU_PKT_SHIFT) +#define HL_CPU_PKT_MASK (~((1 << HL_CPU_PKT_SHIFT) - 1)) +#define HL_CPU_MAX_PKTS_IN_CB 32 +#define HL_CPU_CB_SIZE (HL_CPU_PKT_SIZE * \ + HL_CPU_MAX_PKTS_IN_CB) +#define HL_CPU_CB_QUEUE_SIZE (HL_QUEUE_LENGTH * HL_CPU_CB_SIZE) + +/* KMD <-> ArmCP shared memory size (EQ + PQ + CPU CB queue) */ +#define HL_CPU_ACCESSIBLE_MEM_SIZE (HL_EQ_SIZE_IN_BYTES + \ + HL_QUEUE_SIZE_IN_BYTES + \ + HL_CPU_CB_QUEUE_SIZE) /** * struct hl_hw_queue - describes a H/W transport queue. @@ -381,14 +397,12 @@ struct hl_eq { /** * enum hl_asic_type - supported ASIC types. - * @ASIC_AUTO_DETECT: ASIC type will be automatically set. - * @ASIC_GOYA: Goya device. * @ASIC_INVALID: Invalid ASIC type. + * @ASIC_GOYA: Goya device. */ enum hl_asic_type { - ASIC_AUTO_DETECT, - ASIC_GOYA, - ASIC_INVALID + ASIC_INVALID, + ASIC_GOYA }; struct hl_cs_parser; @@ -436,19 +450,19 @@ enum hl_pll_frequency { * @cb_mmap: maps a CB. * @ring_doorbell: increment PI on a given QMAN. * @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed. - * @dma_alloc_coherent: Allocate coherent DMA memory by calling - * dma_alloc_coherent(). This is ASIC function because its - * implementation is not trivial when the driver is loaded - * in simulation mode (not upstreamed). - * @dma_free_coherent: Free coherent DMA memory by calling dma_free_coherent(). - * This is ASIC function because its implementation is not - * trivial when the driver is loaded in simulation mode - * (not upstreamed). + * @asic_dma_alloc_coherent: Allocate coherent DMA memory by calling + * dma_alloc_coherent(). This is ASIC function because + * its implementation is not trivial when the driver + * is loaded in simulation mode (not upstreamed). + * @asic_dma_free_coherent: Free coherent DMA memory by calling + * dma_free_coherent(). This is ASIC function because + * its implementation is not trivial when the driver + * is loaded in simulation mode (not upstreamed). * @get_int_queue_base: get the internal queue base address. * @test_queues: run simple test on all queues for sanity check. - * @dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool. - * size of allocation is HL_DMA_POOL_BLK_SIZE. - * @dma_pool_free: free small DMA allocation from pool. + * @asic_dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool. + * size of allocation is HL_DMA_POOL_BLK_SIZE. + * @asic_dma_pool_free: free small DMA allocation from pool. * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool. * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool. * @hl_dma_unmap_sg: DMA unmap scatter-gather list. @@ -472,8 +486,7 @@ enum hl_pll_frequency { * @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with * ASID-VA-size mask. * @send_heartbeat: send is-alive packet to ArmCP and verify response. - * @enable_clock_gating: enable clock gating for reducing power consumption. - * @disable_clock_gating: disable clock for accessing registers on HBW. + * @debug_coresight: perform certain actions on Coresight for debugging. * @is_device_idle: return true if device is idle, false otherwise. * @soft_reset_late_init: perform certain actions needed after soft reset. * @hw_queues_lock: acquire H/W queues lock. @@ -482,6 +495,12 @@ enum hl_pll_frequency { * @get_eeprom_data: retrieve EEPROM data from F/W. * @send_cpu_message: send buffer to ArmCP. * @get_hw_state: retrieve the H/W state + * @pci_bars_map: Map PCI BARs. + * @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns + * old address the bar pointed to or U64_MAX for failure + * @init_iatu: Initialize the iATU unit inside the PCI controller. + * @rreg: Read a register. Needed for simulator support. + * @wreg: Write a register. Needed for simulator support. */ struct hl_asic_funcs { int (*early_init)(struct hl_device *hdev); @@ -499,27 +518,27 @@ struct hl_asic_funcs { u64 kaddress, phys_addr_t paddress, u32 size); void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi); void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val); - void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size, + void* (*asic_dma_alloc_coherent)(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flag); - void (*dma_free_coherent)(struct hl_device *hdev, size_t size, + void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size, void *cpu_addr, dma_addr_t dma_handle); void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id, dma_addr_t *dma_handle, u16 *queue_len); int (*test_queues)(struct hl_device *hdev); - void* (*dma_pool_zalloc)(struct hl_device *hdev, size_t size, + void* (*asic_dma_pool_zalloc)(struct hl_device *hdev, size_t size, gfp_t mem_flags, dma_addr_t *dma_handle); - void (*dma_pool_free)(struct hl_device *hdev, void *vaddr, + void (*asic_dma_pool_free)(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr); void* (*cpu_accessible_dma_pool_alloc)(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle); void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev, size_t size, void *vaddr); void (*hl_dma_unmap_sg)(struct hl_device *hdev, - struct scatterlist *sg, int nents, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser); int (*asic_dma_map_sg)(struct hl_device *hdev, - struct scatterlist *sg, int nents, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); u32 (*get_dma_desc_list_size)(struct hl_device *hdev, struct sg_table *sgt); @@ -543,9 +562,8 @@ struct hl_asic_funcs { void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard, u32 asid, u64 va, u64 size); int (*send_heartbeat)(struct hl_device *hdev); - void (*enable_clock_gating)(struct hl_device *hdev); - void (*disable_clock_gating)(struct hl_device *hdev); - bool (*is_device_idle)(struct hl_device *hdev); + int (*debug_coresight)(struct hl_device *hdev, void *data); + bool (*is_device_idle)(struct hl_device *hdev, char *buf, size_t size); int (*soft_reset_late_init)(struct hl_device *hdev); void (*hw_queues_lock)(struct hl_device *hdev); void (*hw_queues_unlock)(struct hl_device *hdev); @@ -555,6 +573,11 @@ struct hl_asic_funcs { int (*send_cpu_message)(struct hl_device *hdev, u32 *msg, u16 len, u32 timeout, long *result); enum hl_device_hw_state (*get_hw_state)(struct hl_device *hdev); + int (*pci_bars_map)(struct hl_device *hdev); + u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr); + int (*init_iatu)(struct hl_device *hdev); + u32 (*rreg)(struct hl_device *hdev, u32 reg); + void (*wreg)(struct hl_device *hdev, u32 reg, u32 val); }; @@ -582,7 +605,8 @@ struct hl_va_range { * struct hl_ctx - user/kernel context. * @mem_hash: holds mapping from virtual address to virtual memory area * descriptor (hl_vm_phys_pg_list or hl_userptr). - * @mmu_hash: holds a mapping from virtual address to pgt_info structure. + * @mmu_phys_hash: holds a mapping from physical address to pgt_info structure. + * @mmu_shadow_hash: holds a mapping from shadow address to pgt_info structure. * @hpriv: pointer to the private (KMD) data of the process (fd). * @hdev: pointer to the device structure. * @refcount: reference counter for the context. Context is released only when @@ -601,17 +625,19 @@ struct hl_va_range { * DRAM mapping. * @cs_lock: spinlock to protect cs_sequence. * @dram_phys_mem: amount of used physical DRAM memory by this context. - * @thread_restore_token: token to prevent multiple threads of the same context - * from running the restore phase. Only one thread - * should run it. - * @thread_restore_wait_token: token to prevent the threads that didn't run - * the restore phase from moving to their execution - * phase before the restore phase has finished. + * @thread_ctx_switch_token: token to prevent multiple threads of the same + * context from running the context switch phase. + * Only a single thread should run it. + * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run + * the context switch phase from moving to their + * execution phase before the context switch phase + * has finished. * @asid: context's unique address space ID in the device's MMU. */ struct hl_ctx { DECLARE_HASHTABLE(mem_hash, MEM_HASH_TABLE_BITS); - DECLARE_HASHTABLE(mmu_hash, MMU_HASH_TABLE_BITS); + DECLARE_HASHTABLE(mmu_phys_hash, MMU_HASH_TABLE_BITS); + DECLARE_HASHTABLE(mmu_shadow_hash, MMU_HASH_TABLE_BITS); struct hl_fpriv *hpriv; struct hl_device *hdev; struct kref refcount; @@ -625,8 +651,8 @@ struct hl_ctx { u64 *dram_default_hops; spinlock_t cs_lock; atomic64_t dram_phys_mem; - atomic_t thread_restore_token; - u32 thread_restore_wait_token; + atomic_t thread_ctx_switch_token; + u32 thread_ctx_switch_wait_token; u32 asid; }; @@ -753,8 +779,6 @@ struct hl_cs_job { * @patched_cb_size: the size of the CB after parsing. * @ext_queue: whether the job is for external queue or internal queue. * @job_id: the id of the related job inside the related CS. - * @use_virt_addr: whether to treat the addresses in the CB as virtual during - * parsing. */ struct hl_cs_parser { struct hl_cb *user_cb; @@ -767,7 +791,6 @@ struct hl_cs_parser { u32 patched_cb_size; u8 ext_queue; u8 job_id; - u8 use_virt_addr; }; @@ -850,6 +873,29 @@ struct hl_vm { u8 init_done; }; + +/* + * DEBUG, PROFILING STRUCTURE + */ + +/** + * struct hl_debug_params - Coresight debug parameters. + * @input: pointer to component specific input parameters. + * @output: pointer to component specific output parameters. + * @output_size: size of output buffer. + * @reg_idx: relevant register ID. + * @op: component operation to execute. + * @enable: true if to enable component debugging, false otherwise. + */ +struct hl_debug_params { + void *input; + void *output; + u32 output_size; + u32 reg_idx; + u32 op; + bool enable; +}; + /* * FILE PRIVATE STRUCTURE */ @@ -973,13 +1019,10 @@ struct hl_dbg_device_entry { u32 hl_rreg(struct hl_device *hdev, u32 reg); void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); -#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \ - readl_poll_timeout(hdev->rmmio + addr, val, cond, sleep_us, timeout_us) - -#define RREG32(reg) hl_rreg(hdev, (reg)) -#define WREG32(reg, v) hl_wreg(hdev, (reg), (v)) +#define RREG32(reg) hdev->asic_funcs->rreg(hdev, (reg)) +#define WREG32(reg, v) hdev->asic_funcs->wreg(hdev, (reg), (v)) #define DREG32(reg) pr_info("REGISTER: " #reg " : 0x%08X\n", \ - hl_rreg(hdev, (reg))) + hdev->asic_funcs->rreg(hdev, (reg))) #define WREG32_P(reg, val, mask) \ do { \ @@ -997,6 +1040,36 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); WREG32(mm##reg, (RREG32(mm##reg) & ~REG_FIELD_MASK(reg, field)) | \ (val) << REG_FIELD_SHIFT(reg, field)) +#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \ +({ \ + ktime_t __timeout; \ + /* timeout should be longer when working with simulator */ \ + if (hdev->pdev) \ + __timeout = ktime_add_us(ktime_get(), timeout_us); \ + else \ + __timeout = ktime_add_us(ktime_get(), (timeout_us * 10)); \ + might_sleep_if(sleep_us); \ + for (;;) { \ + (val) = RREG32(addr); \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \ + (val) = RREG32(addr); \ + break; \ + } \ + if (sleep_us) \ + usleep_range((sleep_us >> 2) + 1, sleep_us); \ + } \ + (cond) ? 0 : -ETIMEDOUT; \ +}) + + +#define HL_ENG_BUSY(buf, size, fmt, ...) ({ \ + if (buf) \ + snprintf(buf, size, fmt, ##__VA_ARGS__); \ + false; \ + }) + struct hwmon_chip_info; /** @@ -1047,7 +1120,8 @@ struct hl_device_reset_work { * @asic_specific: ASIC specific information to use only from ASIC files. * @mmu_pgt_pool: pool of available MMU hops. * @vm: virtual memory manager for MMU. - * @mmu_cache_lock: protects MMU cache invalidation as it can serve one context + * @mmu_cache_lock: protects MMU cache invalidation as it can serve one context. + * @mmu_shadow_hop0: shadow mapping of the MMU hop 0 zone. * @hwmon_dev: H/W monitor device. * @pm_mng_profile: current power management profile. * @hl_chip_info: ASIC's sensors information. @@ -1082,6 +1156,7 @@ struct hl_device_reset_work { * @init_done: is the initialization of the device done. * @mmu_enable: is MMU enabled. * @device_cpu_disabled: is the device CPU disabled (due to timeouts) + * @dma_mask: the dma mask that was set for this device */ struct hl_device { struct pci_dev *pdev; @@ -1117,6 +1192,7 @@ struct hl_device { struct gen_pool *mmu_pgt_pool; struct hl_vm vm; struct mutex mmu_cache_lock; + void *mmu_shadow_hop0; struct device *hwmon_dev; enum hl_pm_mng_profile pm_mng_profile; struct hwmon_chip_info *hl_chip_info; @@ -1151,6 +1227,7 @@ struct hl_device { u8 dram_default_page_mapping; u8 init_done; u8 device_cpu_disabled; + u8 dma_mask; /* Parameters for bring-up */ u8 mmu_enable; @@ -1245,6 +1322,7 @@ static inline bool hl_mem_area_crosses_range(u64 address, u32 size, int hl_device_open(struct inode *inode, struct file *filp); bool hl_device_disabled_or_in_reset(struct hl_device *hdev); +enum hl_device_status hl_device_status(struct hl_device *hdev); int create_hdev(struct hl_device **dev, struct pci_dev *pdev, enum hl_asic_type asic_type, int minor); void destroy_hdev(struct hl_device *hdev); @@ -1351,6 +1429,32 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size); void hl_mmu_swap_out(struct hl_ctx *ctx); void hl_mmu_swap_in(struct hl_ctx *ctx); +int hl_fw_push_fw_to_device(struct hl_device *hdev, const char *fw_name, + void __iomem *dst); +int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode); +int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, + u16 len, u32 timeout, long *result); +int hl_fw_test_cpu_queue(struct hl_device *hdev); +void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle); +void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, + void *vaddr); +int hl_fw_send_heartbeat(struct hl_device *hdev); +int hl_fw_armcp_info_get(struct hl_device *hdev); +int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size); + +int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], + bool is_wc[3]); +int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data); +int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar, + u64 addr); +int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, + u64 dram_base_address, u64 host_phys_base_address, + u64 host_phys_size); +int hl_pci_init(struct hl_device *hdev, u8 dma_mask); +void hl_pci_fini(struct hl_device *hdev); +int hl_pci_set_dma_mask(struct hl_device *hdev, u8 dma_mask); + long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr); void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq); long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr); |