habanalabs: all FD must be closed before removing device

This patch fixes a bug in the implementation of the function that removes the device. The bug can happen when the device is removed but not the driver itself (e.g. remove by the OS due to PCI freeze in Power architecture). In that case, there maybe open users that are calling IOCTLs while the device is removed. This is a possible race condition that the driver must handle. Otherwise, a kernel panic may occur. This race is prevented in the hard-reset flow, because the driver makes sure the users are closed before continuing with the hard-reset. This race can not occur when the driver itself is removed because the OS makes sure all the file descriptors are closed. The fix is to make sure the open users close their file descriptors and if they don't (after a certain amount of time), the driver sends them a SIGKILL, because the remove of the device can't be stopped. The patch re-uses the same code that is called from the hard-reset flow. Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
author: Oded Gabbay <oded.gabbay@gmail.com> 2019-04-06 13:23:54 +0300
committer: Oded Gabbay <oded.gabbay@gmail.com> 2019-04-06 13:23:54 +0300
commit: caa3c8e52582fc4d2ed82afd5e7ea164c18ef4fe (patch)
tree: 93d4ee86bbb2438d64c813484c327aab2034938b
parent: 54303a1aef95b0cbd6a04c3b729c93da7a58e0f7 (diff)
1 files changed, 27 insertions, 5 deletions
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 6cbfd560721e..56d3ba53c661 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -513,11 +513,8 @@ disable_device:
 	return rc;
 }
 
-static void hl_device_hard_reset_pending(struct work_struct *work)
+static void device_kill_open_processes(struct hl_device *hdev)
 {
-	struct hl_device_reset_work *device_reset_work =
-		container_of(work, struct hl_device_reset_work, reset_work);
-	struct hl_device *hdev = device_reset_work->hdev;
 	u16 pending_total, pending_cnt;
 	struct task_struct *task = NULL;
 
@@ -552,6 +549,12 @@ static void hl_device_hard_reset_pending(struct work_struct *work)
 		}
 	}
 
+	/* We killed the open users, but because the driver cleans up after the
+	 * user contexts are closed (e.g. mmu mappings), we need to wait again
+	 * to make sure the cleaning phase is finished before continuing with
+	 * the reset
+	 */
+
 	pending_cnt = pending_total;
 
 	while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {
@@ -567,6 +570,16 @@ static void hl_device_hard_reset_pending(struct work_struct *work)
 
 	mutex_unlock(&hdev->fd_open_cnt_lock);
 
+}
+
+static void device_hard_reset_pending(struct work_struct *work)
+{
+	struct hl_device_reset_work *device_reset_work =
+		container_of(work, struct hl_device_reset_work, reset_work);
+	struct hl_device *hdev = device_reset_work->hdev;
+
+	device_kill_open_processes(hdev);
+
 	hl_device_reset(hdev, true, true);
 
 	kfree(device_reset_work);
@@ -650,7 +663,7 @@ again:
 		 * from a dedicated work
 		 */
 		INIT_WORK(&device_reset_work->reset_work,
-				hl_device_hard_reset_pending);
+				device_hard_reset_pending);
 		device_reset_work->hdev = hdev;
 		schedule_work(&device_reset_work->reset_work);
 
@@ -1049,6 +1062,15 @@ void hl_device_fini(struct hl_device *hdev)
 	/* Mark device as disabled */
 	hdev->disabled = true;
 
+	/*
+	 * Flush anyone that is inside the critical section of enqueue
+	 * jobs to the H/W
+	 */
+	hdev->asic_funcs->hw_queues_lock(hdev);
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+
+	device_kill_open_processes(hdev);
+
 	hl_hwmon_fini(hdev);
 
 	device_late_fini(hdev);
author	Oded Gabbay <oded.gabbay@gmail.com>	2019-04-06 13:23:54 +0300
committer	Oded Gabbay <oded.gabbay@gmail.com>	2019-04-06 13:23:54 +0300
commit	caa3c8e52582fc4d2ed82afd5e7ea164c18ef4fe (patch)
tree	93d4ee86bbb2438d64c813484c327aab2034938b
parent	54303a1aef95b0cbd6a04c3b729c93da7a58e0f7 (diff)