diff options
-rw-r--r-- | drivers/misc/habanalabs/device.c | 6 | ++++++
-rw-r--r-- | drivers/misc/habanalabs/gaudi/gaudi.c | 38 | +++++++++++++++++++++-----------------
-rw-r--r-- | drivers/misc/habanalabs/goya/goya.c | 1 | +
-rw-r--r-- | drivers/misc/habanalabs/habanalabs.h | 2 | ++
-rw-r--r-- | drivers/misc/habanalabs/sysfs.c | 5 | +++++
5 files changed, 35 insertions, 17 deletions
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 4b6c8de46dd8..4a4a446f479e 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -801,6 +801,7 @@ static void device_hard_reset_pending(struct work_struct *work) * @hdev: pointer to habanalabs device structure * @hard_reset: should we do hard reset to all engines or just reset the * compute/dma engines + * @from_hard_reset_thread: is the caller the hard-reset thread * * Block future CS and wait for pending CS to be enqueued * Call ASIC H/W fini @@ -823,6 +824,11 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset, return 0; } + if ((!hard_reset) && (!hdev->supports_soft_reset)) { + dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n"); + hard_reset = true; + } + /* * Prevent concurrency in this function - only one reset should be * done at any given time. Only need to perform this if we didn't diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 3d4a569914d3..92a5130f06fb 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -5774,7 +5774,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK) >> EQ_CTL_EVENT_TYPE_SHIFT); u8 cause; - bool soft_reset_required; + bool reset_required; gaudi->events_stat[event_type]++; gaudi->events_stat_aggregate[event_type]++; @@ -5840,16 +5840,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_TPC6_DEC: case GAUDI_EVENT_TPC7_DEC: gaudi_print_irq_info(hdev, event_type, true); - soft_reset_required = gaudi_tpc_read_interrupts(hdev, + reset_required = gaudi_tpc_read_interrupts(hdev, tpc_dec_event_to_tpc_id(event_type), "AXI_SLV_DEC_Error"); - if (soft_reset_required) { - dev_err_ratelimited(hdev->dev, - "soft reset required due to %s\n", - gaudi_irq_map_table[event_type].name); - hl_device_reset(hdev, false, false); + if 
(reset_required) { + dev_err(hdev->dev, "hard reset required due to %s\n", + gaudi_irq_map_table[event_type].name); + + if (hdev->hard_reset_on_fw_events) + hl_device_reset(hdev, true, false); + } else { + hl_fw_unmask_irq(hdev, event_type); } - hl_fw_unmask_irq(hdev, event_type); break; case GAUDI_EVENT_TPC0_KRN_ERR: @@ -5861,16 +5863,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_TPC6_KRN_ERR: case GAUDI_EVENT_TPC7_KRN_ERR: gaudi_print_irq_info(hdev, event_type, true); - soft_reset_required = gaudi_tpc_read_interrupts(hdev, + reset_required = gaudi_tpc_read_interrupts(hdev, tpc_krn_event_to_tpc_id(event_type), "KRN_ERR"); - if (soft_reset_required) { - dev_err_ratelimited(hdev->dev, - "soft reset required due to %s\n", - gaudi_irq_map_table[event_type].name); - hl_device_reset(hdev, false, false); + if (reset_required) { + dev_err(hdev->dev, "hard reset required due to %s\n", + gaudi_irq_map_table[event_type].name); + + if (hdev->hard_reset_on_fw_events) + hl_device_reset(hdev, true, false); + } else { + hl_fw_unmask_irq(hdev, event_type); } - hl_fw_unmask_irq(hdev, event_type); break; case GAUDI_EVENT_PCIE_CORE_SERR: @@ -5921,8 +5925,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_RAZWI_OR_ADC_SW: gaudi_print_irq_info(hdev, event_type, true); - hl_device_reset(hdev, false, false); - hl_fw_unmask_irq(hdev, event_type); + if (hdev->hard_reset_on_fw_events) + hl_device_reset(hdev, true, false); break; case GAUDI_EVENT_TPC0_BMON_SPMU: diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 15b6c3228e37..152418dfe20c 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -752,6 +752,7 @@ static int goya_sw_init(struct hl_device *hdev) spin_lock_init(&goya->hw_queues_lock); hdev->supports_coresight = true; + hdev->supports_soft_reset = true; return 0; diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h 
index 5a855b7edf43..0f0691875298 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -1436,6 +1436,7 @@ struct hl_device_idle_busy_ts { * @stop_on_err: true if engines should stop on error. * @supports_sync_stream: is sync stream supported. * @supports_coresight: is CoreSight supported. + * @supports_soft_reset: is soft reset supported. */ struct hl_device { struct pci_dev *pdev; @@ -1522,6 +1523,7 @@ struct hl_device { u8 stop_on_err; u8 supports_sync_stream; u8 supports_coresight; + u8 supports_soft_reset; /* Parameters for bring-up */ u8 mmu_enable; diff --git a/drivers/misc/habanalabs/sysfs.c b/drivers/misc/habanalabs/sysfs.c index e4454414d0e1..5d78d5e1c782 100644 --- a/drivers/misc/habanalabs/sysfs.c +++ b/drivers/misc/habanalabs/sysfs.c @@ -183,6 +183,11 @@ static ssize_t soft_reset_store(struct device *dev, goto out; } + if (!hdev->supports_soft_reset) { + dev_err(hdev->dev, "Device does not support soft-reset\n"); + goto out; + } + dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n"); hl_device_reset(hdev, false, false); |