summaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs/common/habanalabs.h
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/habanalabs/common/habanalabs.h')
-rw-r--r--drivers/misc/habanalabs/common/habanalabs.h280
1 files changed, 245 insertions, 35 deletions
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 6579f8767abd..6b3cdd7e068a 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -48,6 +48,7 @@
#define HL_PENDING_RESET_LONG_SEC 60
#define HL_HARD_RESET_MAX_TIMEOUT 120
+#define HL_PLDM_HARD_RESET_MAX_TIMEOUT (HL_HARD_RESET_MAX_TIMEOUT * 3)
#define HL_DEVICE_TIMEOUT_USEC 1000000 /* 1 s */
@@ -115,10 +116,18 @@ enum hl_mmu_page_table_location {
*
* - HL_RESET_HEARTBEAT
* Set if reset is due to heartbeat
+ *
+ * - HL_RESET_TDR
+ * Set if reset is due to TDR
+ *
+ * - HL_RESET_DEVICE_RELEASE
+ * Set if reset is due to device release
*/
#define HL_RESET_HARD (1 << 0)
#define HL_RESET_FROM_RESET_THREAD (1 << 1)
#define HL_RESET_HEARTBEAT (1 << 2)
+#define HL_RESET_TDR (1 << 3)
+#define HL_RESET_DEVICE_RELEASE (1 << 4)
#define HL_MAX_SOBS_PER_MONITOR 8
@@ -178,12 +187,14 @@ enum hl_pci_match_mode {
/**
* enum hl_fw_component - F/W components to read version through registers.
- * @FW_COMP_UBOOT: u-boot.
+ * @FW_COMP_BOOT_FIT: boot fit.
* @FW_COMP_PREBOOT: preboot.
+ * @FW_COMP_LINUX: linux.
*/
enum hl_fw_component {
- FW_COMP_UBOOT,
- FW_COMP_PREBOOT
+ FW_COMP_BOOT_FIT,
+ FW_COMP_PREBOOT,
+ FW_COMP_LINUX,
};
/**
@@ -420,12 +431,24 @@ struct hl_mmu_properties {
* @cb_pool_cb_size: size of each CB in the CB pool.
* @max_pending_cs: maximum of concurrent pending command submissions
* @max_queues: maximum amount of queues in the system
- * @fw_boot_cpu_security_map: bitmap representation of boot cpu security status
- * reported by FW, bit description can be found in
- * CPU_BOOT_DEV_STS*
- * @fw_app_security_map: bitmap representation of application security status
- * reported by FW, bit description can be found in
- * CPU_BOOT_DEV_STS*
+ * @fw_preboot_cpu_boot_dev_sts0: bitmap representation of preboot cpu
+ * capabilities reported by FW, bit description
+ * can be found in CPU_BOOT_DEV_STS0
+ * @fw_preboot_cpu_boot_dev_sts1: bitmap representation of preboot cpu
+ * capabilities reported by FW, bit description
+ * can be found in CPU_BOOT_DEV_STS1
+ * @fw_bootfit_cpu_boot_dev_sts0: bitmap representation of boot cpu security
+ * status reported by FW, bit description can be
+ * found in CPU_BOOT_DEV_STS0
+ * @fw_bootfit_cpu_boot_dev_sts1: bitmap representation of boot cpu security
+ * status reported by FW, bit description can be
+ * found in CPU_BOOT_DEV_STS1
+ * @fw_app_cpu_boot_dev_sts0: bitmap representation of application security
+ * status reported by FW, bit description can be
+ * found in CPU_BOOT_DEV_STS0
+ * @fw_app_cpu_boot_dev_sts1: bitmap representation of application security
+ * status reported by FW, bit description can be
+ * found in CPU_BOOT_DEV_STS1
* @collective_first_sob: first sync object available for collective use
* @collective_first_mon: first monitor available for collective use
* @sync_stream_first_sob: first sync object available for sync stream use
@@ -438,14 +461,19 @@ struct hl_mmu_properties {
* @user_interrupt_count: number of user interrupts.
* @tpc_enabled_mask: which TPCs are enabled.
* @completion_queues_count: number of completion queues.
- * @fw_security_disabled: true if security measures are disabled in firmware,
- * false otherwise
- * @fw_security_status_valid: security status bits are valid and can be fetched
- * from BOOT_DEV_STS0
+ * @fw_security_enabled: true if security measures are enabled in firmware,
+ * false otherwise
+ * @fw_cpu_boot_dev_sts0_valid: status bits are valid and can be fetched from
+ * BOOT_DEV_STS0
+ * @fw_cpu_boot_dev_sts1_valid: status bits are valid and can be fetched from
+ * BOOT_DEV_STS1
* @dram_supports_virtual_memory: is there an MMU towards the DRAM
* @hard_reset_done_by_fw: true if firmware is handling hard reset flow
* @num_functional_hbms: number of functional HBMs in each DCORE.
* @iatu_done_by_fw: true if iATU configuration is being done by FW.
+ * @dynamic_fw_load: is dynamic FW load is supported.
+ * @gic_interrupts_enable: true if FW is not blocking GIC controller,
+ * false otherwise.
*/
struct asic_fixed_properties {
struct hw_queue_properties *hw_queues_props;
@@ -491,8 +519,12 @@ struct asic_fixed_properties {
u32 cb_pool_cb_size;
u32 max_pending_cs;
u32 max_queues;
- u32 fw_boot_cpu_security_map;
- u32 fw_app_security_map;
+ u32 fw_preboot_cpu_boot_dev_sts0;
+ u32 fw_preboot_cpu_boot_dev_sts1;
+ u32 fw_bootfit_cpu_boot_dev_sts0;
+ u32 fw_bootfit_cpu_boot_dev_sts1;
+ u32 fw_app_cpu_boot_dev_sts0;
+ u32 fw_app_cpu_boot_dev_sts1;
u16 collective_first_sob;
u16 collective_first_mon;
u16 sync_stream_first_sob;
@@ -504,12 +536,15 @@ struct asic_fixed_properties {
u16 user_interrupt_count;
u8 tpc_enabled_mask;
u8 completion_queues_count;
- u8 fw_security_disabled;
- u8 fw_security_status_valid;
+ u8 fw_security_enabled;
+ u8 fw_cpu_boot_dev_sts0_valid;
+ u8 fw_cpu_boot_dev_sts1_valid;
u8 dram_supports_virtual_memory;
u8 hard_reset_done_by_fw;
u8 num_functional_hbms;
u8 iatu_done_by_fw;
+ u8 dynamic_fw_load;
+ u8 gic_interrupts_enable;
};
/**
@@ -750,12 +785,19 @@ struct hl_user_pending_interrupt {
* @kernel_address: holds the queue's kernel virtual address
* @bus_address: holds the queue's DMA address
* @ci: ci inside the queue
+ * @prev_eqe_index: the index of the previous event queue entry. The index of
+ * the current entry's index must be +1 of the previous one.
+ * @check_eqe_index: do we need to check the index of the current entry vs. the
+ * previous one. This is for backward compatibility with older
+ * firmwares
*/
struct hl_eq {
struct hl_device *hdev;
void *kernel_address;
dma_addr_t bus_address;
u32 ci;
+ u32 prev_eqe_index;
+ bool check_eqe_index;
};
@@ -812,6 +854,132 @@ enum div_select_defs {
DIV_SEL_DIVIDED_PLL = 3,
};
+enum pci_region {
+ PCI_REGION_CFG,
+ PCI_REGION_SRAM,
+ PCI_REGION_DRAM,
+ PCI_REGION_SP_SRAM,
+ PCI_REGION_NUMBER,
+};
+
+/**
+ * struct pci_mem_region - describe memory region in a PCI bar
+ * @region_base: region base address
+ * @region_size: region size
+ * @bar_size: size of the BAR
+ * @offset_in_bar: region offset into the bar
+ * @bar_id: bar ID of the region
+ * @used: if used 1, otherwise 0
+ */
+struct pci_mem_region {
+ u64 region_base;
+ u64 region_size;
+ u64 bar_size;
+ u32 offset_in_bar;
+ u8 bar_id;
+ u8 used;
+};
+
+/**
+ * struct static_fw_load_mgr - static FW load manager
+ * @preboot_version_max_off: max offset to preboot version
+ * @boot_fit_version_max_off: max offset to boot fit version
+ * @kmd_msg_to_cpu_reg: register address for KDM->CPU messages
+ * @cpu_cmd_status_to_host_reg: register address for CPU command status response
+ * @cpu_boot_status_reg: boot status register
+ * @cpu_boot_dev_status0_reg: boot device status register 0
+ * @cpu_boot_dev_status1_reg: boot device status register 1
+ * @boot_err0_reg: boot error register 0
+ * @boot_err1_reg: boot error register 1
+ * @preboot_version_offset_reg: SRAM offset to preboot version register
+ * @boot_fit_version_offset_reg: SRAM offset to boot fit version register
+ * @sram_offset_mask: mask for getting offset into the SRAM
+ * @cpu_reset_wait_msec: used when setting WFE via kmd_msg_to_cpu_reg
+ */
+struct static_fw_load_mgr {
+ u64 preboot_version_max_off;
+ u64 boot_fit_version_max_off;
+ u32 kmd_msg_to_cpu_reg;
+ u32 cpu_cmd_status_to_host_reg;
+ u32 cpu_boot_status_reg;
+ u32 cpu_boot_dev_status0_reg;
+ u32 cpu_boot_dev_status1_reg;
+ u32 boot_err0_reg;
+ u32 boot_err1_reg;
+ u32 preboot_version_offset_reg;
+ u32 boot_fit_version_offset_reg;
+ u32 sram_offset_mask;
+ u32 cpu_reset_wait_msec;
+};
+
+/**
+ * struct fw_response - FW response to LKD command
+ * @ram_offset: descriptor offset into the RAM
+ * @ram_type: RAM type containing the descriptor (SRAM/DRAM)
+ * @status: command status
+ */
+struct fw_response {
+ u32 ram_offset;
+ u8 ram_type;
+ u8 status;
+};
+
+/**
+ * struct dynamic_fw_load_mgr - dynamic FW load manager
+ * @response: FW to LKD response
+ * @comm_desc: the communication descriptor with FW
+ * @image_region: region to copy the FW image to
+ * @fw_image_size: size of FW image to load
+ * @wait_for_bl_timeout: timeout for waiting for boot loader to respond
+ */
+struct dynamic_fw_load_mgr {
+ struct fw_response response;
+ struct lkd_fw_comms_desc comm_desc;
+ struct pci_mem_region *image_region;
+ size_t fw_image_size;
+ u32 wait_for_bl_timeout;
+};
+
+/**
+ * struct fw_image_props - properties of FW image
+ * @image_name: name of the image
+ * @src_off: offset in src FW to copy from
+ * @copy_size: amount of bytes to copy (0 to copy the whole binary)
+ */
+struct fw_image_props {
+ char *image_name;
+ u32 src_off;
+ u32 copy_size;
+};
+
+/**
+ * struct fw_load_mgr - manager FW loading process
+ * @dynamic_loader: specific structure for dynamic load
+ * @static_loader: specific structure for static load
+ * @boot_fit_img: boot fit image properties
+ * @linux_img: linux image properties
+ * @cpu_timeout: CPU response timeout in usec
+ * @boot_fit_timeout: Boot fit load timeout in usec
+ * @skip_bmc: should BMC be skipped
+ * @sram_bar_id: SRAM bar ID
+ * @dram_bar_id: DRAM bar ID
+ * @linux_loaded: true if linux was loaded so far
+ */
+struct fw_load_mgr {
+ union {
+ struct dynamic_fw_load_mgr dynamic_loader;
+ struct static_fw_load_mgr static_loader;
+ };
+ struct fw_image_props boot_fit_img;
+ struct fw_image_props linux_img;
+ u32 cpu_timeout;
+ u32 boot_fit_timeout;
+ u8 skip_bmc;
+ u8 sram_bar_id;
+ u8 dram_bar_id;
+ u8 linux_loaded;
+};
+
/**
* struct hl_asic_funcs - ASIC specific functions that are can be called from
* common code.
@@ -901,8 +1069,6 @@ enum div_select_defs {
* @ctx_fini: context dependent cleanup.
* @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz
* @get_queue_id_for_cq: Get the H/W queue id related to the given CQ index.
- * @read_device_fw_version: read the device's firmware versions that are
- * contained in registers
* @load_firmware_to_device: load the firmware to the device's memory
* @load_boot_fit_to_device: load boot fit to device's memory
* @get_signal_cb_size: Get signal CB size.
@@ -933,6 +1099,8 @@ enum div_select_defs {
* @get_msi_info: Retrieve asic-specific MSI ID of the f/w async event
* @map_pll_idx_to_fw_idx: convert driver specific per asic PLL index to
* generic f/w compatible PLL Indexes
+ * @init_firmware_loader: initialize data for FW loader.
+ * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling
*/
struct hl_asic_funcs {
int (*early_init)(struct hl_device *hdev);
@@ -1006,7 +1174,7 @@ struct hl_asic_funcs {
int (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
u32 flags);
int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
- u32 asid, u64 va, u64 size);
+ u32 flags, u32 asid, u64 va, u64 size);
int (*send_heartbeat)(struct hl_device *hdev);
void (*set_clock_gating)(struct hl_device *hdev);
void (*disable_clock_gating)(struct hl_device *hdev);
@@ -1030,8 +1198,6 @@ struct hl_asic_funcs {
void (*ctx_fini)(struct hl_ctx *ctx);
int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx);
- int (*read_device_fw_version)(struct hl_device *hdev,
- enum hl_fw_component fwc);
int (*load_firmware_to_device)(struct hl_device *hdev);
int (*load_boot_fit_to_device)(struct hl_device *hdev);
u32 (*get_signal_cb_size)(struct hl_device *hdev);
@@ -1056,8 +1222,10 @@ struct hl_asic_funcs {
int (*hw_block_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
u32 block_id, u32 block_size);
void (*enable_events_from_fw)(struct hl_device *hdev);
- void (*get_msi_info)(u32 *table);
+ void (*get_msi_info)(__le32 *table);
int (*map_pll_idx_to_fw_idx)(u32 pll_idx);
+ void (*init_firmware_loader)(struct hl_device *hdev);
+ void (*init_cpu_scrambler_dram)(struct hl_device *hdev);
};
@@ -1262,6 +1430,7 @@ struct hl_userptr {
* @staged_sequence: the sequence of the staged submission this CS is part of,
* relevant only if staged_cs is set.
* @timeout_jiffies: cs timeout in jiffies.
+ * @submission_time_jiffies: submission time of the cs
* @type: CS_TYPE_*.
* @submitted: true if CS was submitted to H/W.
* @completed: true if CS was completed by device.
@@ -1274,6 +1443,8 @@ struct hl_userptr {
* @staged_first: true if this is the first staged CS and we need to receive
* timeout for this CS.
* @staged_cs: true if this CS is part of a staged submission.
+ * @skip_reset_on_timeout: true if we shall not reset the device in case
+ * timeout occurs (debug scenario).
*/
struct hl_cs {
u16 *jobs_in_queue_cnt;
@@ -1291,6 +1462,7 @@ struct hl_cs {
u64 sequence;
u64 staged_sequence;
u64 timeout_jiffies;
+ u64 submission_time_jiffies;
enum hl_cs_type type;
u8 submitted;
u8 completed;
@@ -1301,6 +1473,7 @@ struct hl_cs {
u8 staged_last;
u8 staged_first;
u8 staged_cs;
+ u8 skip_reset_on_timeout;
};
/**
@@ -1922,7 +2095,7 @@ struct hl_mmu_funcs {
* @kernel_queues: array of hl_hw_queue.
* @cs_mirror_list: CS mirror list for TDR.
* @cs_mirror_lock: protects cs_mirror_list.
- * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
+ * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CBs.
* @event_queue: event queue for IRQ from CPU-CP.
* @dma_pool: DMA pool for small allocations.
* @cpu_accessible_dma_mem: Host <-> CPU-CP shared memory CPU address.
@@ -1954,6 +2127,8 @@ struct hl_mmu_funcs {
* @aggregated_cs_counters: aggregated cs counters among all contexts
* @mmu_priv: device-specific MMU data.
* @mmu_func: device-related MMU functions.
+ * @fw_loader: FW loader manager.
+ * @pci_mem_region: array of memory regions in the PCI
* @dram_used_mem: current DRAM memory consumption.
* @timeout_jiffies: device CS timeout value.
* @max_power: the max power of the device, as configured by the sysadmin. This
@@ -1968,6 +2143,11 @@ struct hl_mmu_funcs {
* the error will be ignored by the driver during
* device initialization. Mainly used to debug and
* workaround firmware bugs
+ * @last_successful_open_jif: timestamp (jiffies) of the last successful
+ * device open.
+ * @last_open_session_duration_jif: duration (jiffies) of the last device open
+ * session.
+ * @open_counter: number of successful device open operations.
* @in_reset: is device in reset flow.
* @curr_pll_profile: current PLL profile.
* @card_type: Various ASICs have several card types. This indicates the card
@@ -2007,6 +2187,8 @@ struct hl_mmu_funcs {
* @collective_mon_idx: helper index for collective initialization
* @supports_coresight: is CoreSight supported.
* @supports_soft_reset: is soft reset supported.
+ * @allow_external_soft_reset: true if soft reset initiated by user or TDR is
+ * allowed.
* @supports_cb_mapping: is mapping a CB to the device's MMU supported.
* @needs_reset: true if reset_on_lockup is false and device should be reset
* due to lockup.
@@ -2015,6 +2197,14 @@ struct hl_mmu_funcs {
* @device_fini_pending: true if device_fini was called and might be
* waiting for the reset thread to finish
* @supports_staged_submission: true if staged submissions are supported
+ * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
+ * triggered, and cleared after it is shared with preboot.
+ * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
+ * complete instead.
+ * @device_cpu_is_halted: Flag to indicate whether the device CPU was already
+ * halted. We can't halt it again because the COMMS
+ * protocol will throw an error. Relevant only for
+ * cases where Linux was not loaded to device CPU
*/
struct hl_device {
struct pci_dev *pdev;
@@ -2079,11 +2269,18 @@ struct hl_device {
struct hl_mmu_priv mmu_priv;
struct hl_mmu_funcs mmu_func[MMU_NUM_PGT_LOCATIONS];
+ struct fw_load_mgr fw_loader;
+
+ struct pci_mem_region pci_mem_region[PCI_REGION_NUMBER];
+
atomic64_t dram_used_mem;
u64 timeout_jiffies;
u64 max_power;
u64 clock_gating_mask;
u64 boot_error_status_mask;
+ u64 last_successful_open_jif;
+ u64 last_open_session_duration_jif;
+ u64 open_counter;
atomic_t in_reset;
enum hl_pll_frequency curr_pll_profile;
enum cpucp_card_types card_type;
@@ -2116,11 +2313,15 @@ struct hl_device {
u8 collective_mon_idx;
u8 supports_coresight;
u8 supports_soft_reset;
+ u8 allow_external_soft_reset;
u8 supports_cb_mapping;
u8 needs_reset;
u8 process_kill_trial_cnt;
u8 device_fini_pending;
u8 supports_staged_submission;
+ u8 curr_reset_cause;
+ u8 skip_reset_on_timeout;
+ u8 device_cpu_is_halted;
/* Parameters for bring-up */
u64 nic_ports_mask;
@@ -2138,6 +2339,7 @@ struct hl_device {
u8 rl_enable;
u8 reset_on_preboot_fail;
u8 reset_upon_device_release;
+ u8 reset_if_device_not_idle;
};
@@ -2384,11 +2586,13 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
void *vaddr);
int hl_fw_send_heartbeat(struct hl_device *hdev);
int hl_fw_cpucp_info_get(struct hl_device *hdev,
- u32 cpu_security_boot_status_reg,
- u32 boot_err0_reg);
+ u32 sts_boot_dev_sts0_reg,
+ u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg,
+ u32 boot_err1_reg);
int hl_fw_cpucp_handshake(struct hl_device *hdev,
- u32 cpu_security_boot_status_reg,
- u32 boot_err0_reg);
+ u32 sts_boot_dev_sts0_reg,
+ u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg,
+ u32 boot_err1_reg);
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
struct hl_info_pci_counters *counters);
@@ -2399,14 +2603,17 @@ int get_used_pll_index(struct hl_device *hdev, u32 input_pll_index,
int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index,
u16 *pll_freq_arr);
int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power);
-int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
- u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
- u32 cpu_security_boot_status_reg, u32 boot_err0_reg,
- bool skip_bmc, u32 cpu_timeout, u32 boot_fit_timeout);
+void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev);
+void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev);
+int hl_fw_init_cpu(struct hl_device *hdev);
int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
- u32 cpu_security_boot_status_reg, u32 boot_err0_reg,
- u32 timeout);
-
+ u32 sts_boot_dev_sts0_reg,
+ u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg,
+ u32 boot_err1_reg, u32 timeout);
+int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev,
+ struct fw_load_mgr *fw_loader,
+ enum comms_cmd cmd, unsigned int size,
+ bool wait_ok, u32 timeout);
int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
bool is_wc[3]);
int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data);
@@ -2415,6 +2622,7 @@ int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region,
struct hl_inbound_pci_region *pci_region);
int hl_pci_set_outbound_region(struct hl_device *hdev,
struct hl_outbound_pci_region *pci_region);
+enum pci_region hl_get_pci_memory_region(struct hl_device *hdev, u64 addr);
int hl_pci_init(struct hl_device *hdev);
void hl_pci_fini(struct hl_device *hdev);
@@ -2443,6 +2651,8 @@ int hl_set_voltage(struct hl_device *hdev,
int hl_set_current(struct hl_device *hdev,
int sensor_index, u32 attr, long value);
void hl_release_pending_user_interrupts(struct hl_device *hdev);
+int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
+ struct hl_hw_sob **hw_sob, u32 count);
#ifdef CONFIG_DEBUG_FS