summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYuri Nudelman <ynudelman@habana.ai>2021-06-13 09:22:20 +0300
committerOded Gabbay <ogabbay@kernel.org>2021-06-18 15:23:43 +0300
commit4d041216c83dd9933c7c72b40511bb3585fa1724 (patch)
treeaa10c6afd4278898d93d4b1283fce73beea3d45d
parent38e19d0b87ebc380341d5c026abed9e8060b2d37 (diff)
debugfs: add skip_reset_on_timeout option
To be able to debug long-running CS better, without changing the userspace code, we are adding a new option through debugfs interface to skip the reset of the device in case of CS timeout. Signed-off-by: Yuri Nudelman <ynudelman@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
-rw-r--r--Documentation/ABI/testing/debugfs-driver-habanalabs8
-rw-r--r--drivers/misc/habanalabs/common/command_submission.c1
-rw-r--r--drivers/misc/habanalabs/common/debugfs.c5
-rw-r--r--drivers/misc/habanalabs/common/habanalabs.h3
4 files changed, 17 insertions, 0 deletions
diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index c78fc9282876..e78ceb1f70b3 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -207,6 +207,14 @@ Contact: ogabbay@kernel.org
Description: Sets the PCI power state. Valid values are "1" for D0 and "2"
for D3Hot
+What: /sys/kernel/debug/habanalabs/hl<n>/skip_reset_on_timeout
+Date: Jun 2021
+KernelVersion: 5.13
+Contact: ynudelman@habana.ai
+Description: Sets the skip reset on timeout option for the device. Value of
+ "0" means device will be reset in case some CS has timed out,
+ otherwise it will not be reset.
+
What: /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
Date: Mar 2020
KernelVersion: 5.6
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 6d51f54030c1..adedb288d452 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -663,6 +663,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
cs->timeout_jiffies = timeout;
cs->skip_reset_on_timeout =
+ hdev->skip_reset_on_timeout ||
!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
cs->submission_time_jiffies = jiffies;
INIT_LIST_HEAD(&cs->job_list);
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 8381155578a0..703d79fb6f3f 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -1278,6 +1278,11 @@ void hl_debugfs_add_device(struct hl_device *hdev)
dev_entry->root,
&dev_entry->blob_desc);
+ debugfs_create_x8("skip_reset_on_timeout",
+ 0644,
+ dev_entry->root,
+ &hdev->skip_reset_on_timeout);
+
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
debugfs_create_file(hl_debugfs_list[i].name,
0444,
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index b4413c398142..09b89fdeba0b 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2191,6 +2191,8 @@ struct hl_mmu_funcs {
* @supports_staged_submission: true if staged submissions are supported
* @curr_reset_cause: saves an enumerated reset cause when a hard reset is
* triggered, and cleared after it is shared with preboot.
+ * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
+ * complete instead.
*/
struct hl_device {
struct pci_dev *pdev;
@@ -2305,6 +2307,7 @@ struct hl_device {
u8 device_fini_pending;
u8 supports_staged_submission;
u8 curr_reset_cause;
+ u8 skip_reset_on_timeout;
/* Parameters for bring-up */
u64 nic_ports_mask;