diff options
author | Yuri Nudelman <ynudelman@habana.ai> | 2021-06-13 09:22:20 +0300 |
---|---|---|
committer | Oded Gabbay <ogabbay@kernel.org> | 2021-06-18 15:23:43 +0300 |
commit | 4d041216c83dd9933c7c72b40511bb3585fa1724 (patch) | |
tree | aa10c6afd4278898d93d4b1283fce73beea3d45d | |
parent | 38e19d0b87ebc380341d5c026abed9e8060b2d37 (diff) |
debugfs: add skip_reset_on_timeout option
To be able to debug long-running CS better, without changing the
userspace code, we are adding a new option through debugfs interface
to skip the reset of the device in case of CS timeout.
Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
-rw-r--r-- | Documentation/ABI/testing/debugfs-driver-habanalabs | 8 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/command_submission.c | 1 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/debugfs.c | 5 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs.h | 3 |
4 files changed, 17 insertions, 0 deletions
diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs index c78fc9282876..e78ceb1f70b3 100644 --- a/Documentation/ABI/testing/debugfs-driver-habanalabs +++ b/Documentation/ABI/testing/debugfs-driver-habanalabs @@ -207,6 +207,14 @@ Contact: ogabbay@kernel.org Description: Sets the PCI power state. Valid values are "1" for D0 and "2" for D3Hot +What: /sys/kernel/debug/habanalabs/hl<n>/skip_reset_on_timeout +Date: Jun 2021 +KernelVersion: 5.13 +Contact: ynudelman@habana.ai +Description: Sets the skip reset on timeout option for the device. Value of + "0" means device will be reset in case some CS has timed out, + otherwise it will not be reset. + What: /sys/kernel/debug/habanalabs/hl<n>/stop_on_err Date: Mar 2020 KernelVersion: 5.6 diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 6d51f54030c1..adedb288d452 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -663,6 +663,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP); cs->timeout_jiffies = timeout; cs->skip_reset_on_timeout = + hdev->skip_reset_on_timeout || !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT); cs->submission_time_jiffies = jiffies; INIT_LIST_HEAD(&cs->job_list); diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index 8381155578a0..703d79fb6f3f 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -1278,6 +1278,11 @@ void hl_debugfs_add_device(struct hl_device *hdev) dev_entry->root, &dev_entry->blob_desc); + debugfs_create_x8("skip_reset_on_timeout", + 0644, + dev_entry->root, + &hdev->skip_reset_on_timeout); + for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { debugfs_create_file(hl_debugfs_list[i].name, 0444, diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index b4413c398142..09b89fdeba0b 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2191,6 +2191,8 @@ struct hl_mmu_funcs { * @supports_staged_submission: true if staged submissions are supported * @curr_reset_cause: saves an enumerated reset cause when a hard reset is * triggered, and cleared after it is shared with preboot. + * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to + * complete instead. */ struct hl_device { struct pci_dev *pdev; @@ -2305,6 +2307,7 @@ struct hl_device { u8 device_fini_pending; u8 supports_staged_submission; u8 curr_reset_cause; + u8 skip_reset_on_timeout; /* Parameters for bring-up */ u64 nic_ports_mask; |