11 files changed, 2843 insertions, 7 deletions
diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
index b5607233d216..d2fd0e18b1eb 100644
--- a/drivers/misc/habanalabs/Makefile
+++ b/drivers/misc/habanalabs/Makefile
@@ -5,7 +5,8 @@
 obj-m	:= habanalabs.o
 
 habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
-		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o
+		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
+		command_submission.o
 
 include $(src)/goya/Makefile
 habanalabs-y += $(HL_GOYA_FILES)
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
new file mode 100644
index 000000000000..ae68b97e428d
--- /dev/null
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -0,0 +1,766 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include <uapi/misc/habanalabs.h>
+#include "habanalabs.h"
+
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+static void job_wq_completion(struct work_struct *work);
+static long _hl_cs_wait_ioctl(struct hl_device *hdev,
+		struct hl_ctx *ctx, u64 timeout_us, u64 seq);
+static void cs_do_release(struct kref *ref);
+
+static const char *hl_fence_get_driver_name(struct dma_fence *fence)
+{
+	return "HabanaLabs";
+}
+
+static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
+{
+	struct hl_dma_fence *hl_fence =
+		container_of(fence, struct hl_dma_fence, base_fence);
+
+	return dev_name(hl_fence->hdev->dev);
+}
+
+static bool hl_fence_enable_signaling(struct dma_fence *fence)
+{
+	return true;
+}
+
+static void hl_fence_release(struct dma_fence *fence)
+{
+	struct hl_dma_fence *hl_fence =
+		container_of(fence, struct hl_dma_fence, base_fence);
+
+	kfree_rcu(hl_fence, base_fence.rcu);
+}
+
+static const struct dma_fence_ops hl_fence_ops = {
+	.get_driver_name = hl_fence_get_driver_name,
+	.get_timeline_name = hl_fence_get_timeline_name,
+	.enable_signaling = hl_fence_enable_signaling,
+	.wait = dma_fence_default_wait,
+	.release = hl_fence_release
+};
+
+static void cs_get(struct hl_cs *cs)
+{
+	kref_get(&cs->refcount);
+}
+
+static int cs_get_unless_zero(struct hl_cs *cs)
+{
+	return kref_get_unless_zero(&cs->refcount);
+}
+
+static void cs_put(struct hl_cs *cs)
+{
+	kref_put(&cs->refcount, cs_do_release);
+}
+
+/*
+ * cs_parser - parse the user command submission
+ *
+ * @hpriv	: pointer to the private data of the fd
+ * @job        : pointer to the job that holds the command submission info
+ *
+ * The function parses the command submission of the user. It calls the
+ * ASIC specific parser, which returns a list of memory blocks to send
+ * to the device as different command buffers
+ *
+ */
+static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_cs_parser parser;
+	int rc;
+
+	parser.ctx_id = job->cs->ctx->asid;
+	parser.cs_sequence = job->cs->sequence;
+	parser.job_id = job->id;
+
+	parser.hw_queue_id = job->hw_queue_id;
+	parser.job_userptr_list = &job->userptr_list;
+	parser.patched_cb = NULL;
+	parser.user_cb = job->user_cb;
+	parser.user_cb_size = job->user_cb_size;
+	parser.ext_queue = job->ext_queue;
+	job->patched_cb = NULL;
+	parser.use_virt_addr = hdev->mmu_enable;
+
+	rc = hdev->asic_funcs->cs_parser(hdev, &parser);
+	if (job->ext_queue) {
+		if (!rc) {
+			job->patched_cb = parser.patched_cb;
+			job->job_cb_size = parser.patched_cb_size;
+
+			spin_lock(&job->patched_cb->lock);
+			job->patched_cb->cs_cnt++;
+			spin_unlock(&job->patched_cb->lock);
+		}
+
+		/*
+		 * Whether the parsing worked or not, we don't need the
+		 * original CB anymore because it was already parsed and
+		 * won't be accessed again for this CS
+		 */
+		spin_lock(&job->user_cb->lock);
+		job->user_cb->cs_cnt--;
+		spin_unlock(&job->user_cb->lock);
+		hl_cb_put(job->user_cb);
+		job->user_cb = NULL;
+	}
+
+	return rc;
+}
+
+static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
+{
+	struct hl_cs *cs = job->cs;
+
+	if (job->ext_queue) {
+		hl_userptr_delete_list(hdev, &job->userptr_list);
+
+		/*
+		 * We might arrive here from rollback and patched CB wasn't
+		 * created, so we need to check it's not NULL
+		 */
+		if (job->patched_cb) {
+			spin_lock(&job->patched_cb->lock);
+			job->patched_cb->cs_cnt--;
+			spin_unlock(&job->patched_cb->lock);
+
+			hl_cb_put(job->patched_cb);
+		}
+	}
+
+	/*
+	 * This is the only place where there can be multiple threads
+	 * modifying the list at the same time
+	 */
+	spin_lock(&cs->job_lock);
+	list_del(&job->cs_node);
+	spin_unlock(&cs->job_lock);
+
+	if (job->ext_queue)
+		cs_put(cs);
+
+	kfree(job);
+}
+
+static void cs_do_release(struct kref *ref)
+{
+	struct hl_cs *cs = container_of(ref, struct hl_cs,
+						refcount);
+	struct hl_device *hdev = cs->ctx->hdev;
+	struct hl_cs_job *job, *tmp;
+
+	cs->completed = true;
+
+	/*
+	 * Although if we reached here it means that all external jobs have
+	 * finished, because each one of them took refcnt to CS, we still
+	 * need to go over the internal jobs and free them. Otherwise, we
+	 * will have leaked memory and what's worse, the CS object (and
+	 * potentially the CTX object) could be released, while the JOB
+	 * still holds a pointer to them (but no reference).
+	 */
+	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
+		free_job(hdev, job);
+
+	/* We also need to update CI for internal queues */
+	if (cs->submitted) {
+		hl_int_hw_queue_update_ci(cs);
+
+		spin_lock(&hdev->hw_queues_mirror_lock);
+		/* remove CS from hw_queues mirror list */
+		list_del_init(&cs->mirror_node);
+		spin_unlock(&hdev->hw_queues_mirror_lock);
+
+		/*
+		 * Don't cancel TDR in case this CS was timedout because we
+		 * might be running from the TDR context
+		 */
+		if ((!cs->timedout) &&
+			(hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT)) {
+			struct hl_cs *next;
+
+			if (cs->tdr_active)
+				cancel_delayed_work_sync(&cs->work_tdr);
+
+			spin_lock(&hdev->hw_queues_mirror_lock);
+
+			/* queue TDR for next CS */
+			next = list_first_entry_or_null(
+					&hdev->hw_queues_mirror_list,
+					struct hl_cs, mirror_node);
+
+			if ((next) && (!next->tdr_active)) {
+				next->tdr_active = true;
+				schedule_delayed_work(&next->work_tdr,
+							hdev->timeout_jiffies);
+			}
+
+			spin_unlock(&hdev->hw_queues_mirror_lock);
+		}
+	}
+
+	hl_ctx_put(cs->ctx);
+
+	if (cs->timedout)
+		dma_fence_set_error(cs->fence, -ETIMEDOUT);
+	else if (cs->aborted)
+		dma_fence_set_error(cs->fence, -EIO);
+
+	dma_fence_signal(cs->fence);
+	dma_fence_put(cs->fence);
+
+	kfree(cs);
+}
+
+static void cs_timedout(struct work_struct *work)
+{
+	struct hl_device *hdev;
+	int ctx_asid, rc;
+	struct hl_cs *cs = container_of(work, struct hl_cs,
+						 work_tdr.work);
+	rc = cs_get_unless_zero(cs);
+	if (!rc)
+		return;
+
+	if ((!cs->submitted) || (cs->completed)) {
+		cs_put(cs);
+		return;
+	}
+
+	/* Mark the CS is timed out so we won't try to cancel its TDR */
+	cs->timedout = true;
+
+	hdev = cs->ctx->hdev;
+	ctx_asid = cs->ctx->asid;
+
+	/* TODO: add information about last signaled seq and last emitted seq */
+	dev_err(hdev->dev, "CS %d.%llu got stuck!\n", ctx_asid, cs->sequence);
+
+	cs_put(cs);
+
+	if (hdev->reset_on_lockup)
+		hl_device_reset(hdev, false, false);
+}
+
+static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
+			struct hl_cs **cs_new)
+{
+	struct hl_dma_fence *fence;
+	struct dma_fence *other = NULL;
+	struct hl_cs *cs;
+	int rc;
+
+	cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
+	if (!cs)
+		return -ENOMEM;
+
+	cs->ctx = ctx;
+	cs->submitted = false;
+	cs->completed = false;
+	INIT_LIST_HEAD(&cs->job_list);
+	INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
+	kref_init(&cs->refcount);
+	spin_lock_init(&cs->job_lock);
+
+	fence = kmalloc(sizeof(*fence), GFP_ATOMIC);
+	if (!fence) {
+		rc = -ENOMEM;
+		goto free_cs;
+	}
+
+	fence->hdev = hdev;
+	spin_lock_init(&fence->lock);
+	cs->fence = &fence->base_fence;
+
+	spin_lock(&ctx->cs_lock);
+
+	fence->cs_seq = ctx->cs_sequence;
+	other = ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)];
+	if ((other) && (!dma_fence_is_signaled(other))) {
+		spin_unlock(&ctx->cs_lock);
+		rc = -EAGAIN;
+		goto free_fence;
+	}
+
+	dma_fence_init(&fence->base_fence, &hl_fence_ops, &fence->lock,
+			ctx->asid, ctx->cs_sequence);
+
+	cs->sequence = fence->cs_seq;
+
+	ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+							&fence->base_fence;
+	ctx->cs_sequence++;
+
+	dma_fence_get(&fence->base_fence);
+
+	dma_fence_put(other);
+
+	spin_unlock(&ctx->cs_lock);
+
+	*cs_new = cs;
+
+	return 0;
+
+free_fence:
+	kfree(fence);
+free_cs:
+	kfree(cs);
+	return rc;
+}
+
+static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
+{
+	struct hl_cs_job *job, *tmp;
+
+	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
+		free_job(hdev, job);
+}
+
+void hl_cs_rollback_all(struct hl_device *hdev)
+{
+	struct hl_cs *cs, *tmp;
+
+	/* flush all completions */
+	flush_workqueue(hdev->cq_wq);
+
+	/* Make sure we don't have leftovers in the H/W queues mirror list */
+	list_for_each_entry_safe(cs, tmp, &hdev->hw_queues_mirror_list,
+				mirror_node) {
+		cs_get(cs);
+		cs->aborted = true;
+		dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
+					cs->ctx->asid, cs->sequence);
+		cs_rollback(hdev, cs);
+		cs_put(cs);
+	}
+}
+
+static void job_wq_completion(struct work_struct *work)
+{
+	struct hl_cs_job *job = container_of(work, struct hl_cs_job,
+						finish_work);
+	struct hl_cs *cs = job->cs;
+	struct hl_device *hdev = cs->ctx->hdev;
+
+	/* job is no longer needed */
+	free_job(hdev, job);
+}
+
+static struct hl_cb *validate_queue_index(struct hl_device *hdev,
+					struct hl_cb_mgr *cb_mgr,
+					struct hl_cs_chunk *chunk,
+					bool *ext_queue)
+{
+	struct asic_fixed_properties *asic = &hdev->asic_prop;
+	struct hw_queue_properties *hw_queue_prop;
+	u32 cb_handle;
+	struct hl_cb *cb;
+
+	/* Assume external queue */
+	*ext_queue = true;
+
+	hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
+
+	if ((chunk->queue_index >= HL_MAX_QUEUES) ||
+			(hw_queue_prop->type == QUEUE_TYPE_NA)) {
+		dev_err(hdev->dev, "Queue index %d is invalid\n",
+			chunk->queue_index);
+		return NULL;
+	}
+
+	if (hw_queue_prop->kmd_only) {
+		dev_err(hdev->dev, "Queue index %d is restricted for KMD\n",
+			chunk->queue_index);
+		return NULL;
+	} else if (hw_queue_prop->type == QUEUE_TYPE_INT) {
+		*ext_queue = false;
+		return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
+	}
+
+	/* Retrieve CB object */
+	cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
+
+	cb = hl_cb_get(hdev, cb_mgr, cb_handle);
+	if (!cb) {
+		dev_err(hdev->dev, "CB handle 0x%x invalid\n", cb_handle);
+		return NULL;
+	}
+
+	if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) {
+		dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size);
+		goto release_cb;
+	}
+
+	spin_lock(&cb->lock);
+	cb->cs_cnt++;
+	spin_unlock(&cb->lock);
+
+	return cb;
+
+release_cb:
+	hl_cb_put(cb);
+	return NULL;
+}
+
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
+{
+	struct hl_cs_job *job;
+
+	job = kzalloc(sizeof(*job), GFP_ATOMIC);
+	if (!job)
+		return NULL;
+
+	job->ext_queue = ext_queue;
+
+	if (job->ext_queue) {
+		INIT_LIST_HEAD(&job->userptr_list);
+		INIT_WORK(&job->finish_work, job_wq_completion);
+	}
+
+	return job;
+}
+
+static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
+			u32 num_chunks, u64 *cs_seq)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_cs_chunk *cs_chunk_array;
+	struct hl_cs_job *job;
+	struct hl_cs *cs;
+	struct hl_cb *cb;
+	bool ext_queue_present = false;
+	u32 size_to_copy;
+	int rc, i, parse_cnt;
+
+	*cs_seq = ULLONG_MAX;
+
+	if (num_chunks > HL_MAX_JOBS_PER_CS) {
+		dev_err(hdev->dev,
+			"Number of chunks can NOT be larger than %d\n",
+			HL_MAX_JOBS_PER_CS);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array),
+					GFP_ATOMIC);
+	if (!cs_chunk_array) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
+	if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) {
+		dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
+		rc = -EFAULT;
+		goto free_cs_chunk_array;
+	}
+
+	/* increment refcnt for context */
+	hl_ctx_get(hdev, hpriv->ctx);
+
+	rc = allocate_cs(hdev, hpriv->ctx, &cs);
+	if (rc) {
+		hl_ctx_put(hpriv->ctx);
+		goto free_cs_chunk_array;
+	}
+
+	*cs_seq = cs->sequence;
+
+	/* Validate ALL the CS chunks before submitting the CS */
+	for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) {
+		struct hl_cs_chunk *chunk = &cs_chunk_array[i];
+		bool ext_queue;
+
+		cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk,
+					&ext_queue);
+		if (ext_queue) {
+			ext_queue_present = true;
+			if (!cb) {
+				rc = -EINVAL;
+				goto free_cs_object;
+			}
+		}
+
+		job = hl_cs_allocate_job(hdev, ext_queue);
+		if (!job) {
+			dev_err(hdev->dev, "Failed to allocate a new job\n");
+			rc = -ENOMEM;
+			if (ext_queue)
+				goto release_cb;
+			else
+				goto free_cs_object;
+		}
+
+		job->id = i + 1;
+		job->cs = cs;
+		job->user_cb = cb;
+		job->user_cb_size = chunk->cb_size;
+		if (job->ext_queue)
+			job->job_cb_size = cb->size;
+		else
+			job->job_cb_size = chunk->cb_size;
+		job->hw_queue_id = chunk->queue_index;
+
+		cs->jobs_in_queue_cnt[job->hw_queue_id]++;
+
+		list_add_tail(&job->cs_node, &cs->job_list);
+
+		/*
+		 * Increment CS reference. When CS reference is 0, CS is
+		 * done and can be signaled to user and free all its resources
+		 * Only increment for JOB on external queues, because only
+		 * for those JOBs we get completion
+		 */
+		if (job->ext_queue)
+			cs_get(cs);
+
+		rc = cs_parser(hpriv, job);
+		if (rc) {
+			dev_err(hdev->dev,
+				"Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
+				cs->ctx->asid, cs->sequence, job->id, rc);
+			goto free_cs_object;
+		}
+	}
+
+	if (!ext_queue_present) {
+		dev_err(hdev->dev,
+			"Reject CS %d.%llu because no external queues jobs\n",
+			cs->ctx->asid, cs->sequence);
+		rc = -EINVAL;
+		goto free_cs_object;
+	}
+
+	rc = hl_hw_queue_schedule_cs(cs);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to submit CS %d.%llu to H/W queues, error %d\n",
+			cs->ctx->asid, cs->sequence, rc);
+		goto free_cs_object;
+	}
+
+	rc = HL_CS_STATUS_SUCCESS;
+	goto put_cs;
+
+release_cb:
+	spin_lock(&cb->lock);
+	cb->cs_cnt--;
+	spin_unlock(&cb->lock);
+	hl_cb_put(cb);
+free_cs_object:
+	cs_rollback(hdev, cs);
+	*cs_seq = ULLONG_MAX;
+	/* The path below is both for good and erroneous exits */
+put_cs:
+	/* We finished with the CS in this function, so put the ref */
+	cs_put(cs);
+free_cs_chunk_array:
+	kfree(cs_chunk_array);
+out:
+	return rc;
+}
+
+int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	union hl_cs_args *args = data;
+	struct hl_ctx *ctx = hpriv->ctx;
+	void __user *chunks;
+	u32 num_chunks;
+	u64 cs_seq = ULONG_MAX;
+	int rc, do_restore;
+	bool need_soft_reset = false;
+
+	if (hl_device_disabled_or_in_reset(hdev)) {
+		dev_warn(hdev->dev,
+			"Device is %s. Can't submit new CS\n",
+			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+		rc = -EBUSY;
+		goto out;
+	}
+
+	do_restore = atomic_cmpxchg(&ctx->thread_restore_token, 1, 0);
+
+	if (do_restore || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
+		long ret;
+
+		chunks = (void __user *)(uintptr_t)args->in.chunks_restore;
+		num_chunks = args->in.num_chunks_restore;
+
+		mutex_lock(&hpriv->restore_phase_mutex);
+
+		if (do_restore) {
+			rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
+			if (rc) {
+				dev_err_ratelimited(hdev->dev,
+					"Failed to switch to context %d, rejecting CS! %d\n",
+					ctx->asid, rc);
+				/*
+				 * If we timedout, we need to soft-reset because
+				 * QMAN is probably stuck. However, we can't
+				 * call to reset here directly because of
+				 * deadlock, so need to do it at the very end
+				 * of this function
+				 */
+				if (rc == -ETIMEDOUT)
+					need_soft_reset = true;
+				mutex_unlock(&hpriv->restore_phase_mutex);
+				goto out;
+			}
+		}
+
+		hdev->asic_funcs->restore_phase_topology(hdev);
+
+		if (num_chunks == 0) {
+			dev_dbg(hdev->dev,
+			"Need to run restore phase but restore CS is empty\n");
+			rc = 0;
+		} else {
+			rc = _hl_cs_ioctl(hpriv, chunks, num_chunks,
+						&cs_seq);
+		}
+
+		mutex_unlock(&hpriv->restore_phase_mutex);
+
+		if (rc) {
+			dev_err(hdev->dev,
+				"Failed to submit restore CS for context %d (%d)\n",
+				ctx->asid, rc);
+			goto out;
+		}
+
+		/* Need to wait for restore completion before execution phase */
+		if (num_chunks > 0) {
+			ret = _hl_cs_wait_ioctl(hdev, ctx,
+					jiffies_to_usecs(hdev->timeout_jiffies),
+					cs_seq);
+			if (ret <= 0) {
+				dev_err(hdev->dev,
+					"Restore CS for context %d failed to complete %ld\n",
+					ctx->asid, ret);
+				rc = -ENOEXEC;
+				goto out;
+			}
+		}
+
+		ctx->thread_restore_wait_token = 1;
+	} else if (!ctx->thread_restore_wait_token) {
+		u32 tmp;
+
+		rc = hl_poll_timeout_memory(hdev,
+			(u64) (uintptr_t) &ctx->thread_restore_wait_token,
+			jiffies_to_usecs(hdev->timeout_jiffies),
+			&tmp);
+
+		if (rc || !tmp) {
+			dev_err(hdev->dev,
+				"restore phase hasn't finished in time\n");
+			rc = -ETIMEDOUT;
+			goto out;
+		}
+	}
+
+	chunks = (void __user *)(uintptr_t)args->in.chunks_execute;
+	num_chunks = args->in.num_chunks_execute;
+
+	if (num_chunks == 0) {
+		dev_err(hdev->dev,
+			"Got execute CS with 0 chunks, context %d\n",
+			ctx->asid);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = _hl_cs_ioctl(hpriv, chunks, num_chunks, &cs_seq);
+
+out:
+	if (rc != -EAGAIN) {
+		memset(args, 0, sizeof(*args));
+		args->out.status = rc;
+		args->out.seq = cs_seq;
+	}
+
+	if ((rc == -ETIMEDOUT) && (need_soft_reset))
+		hl_device_reset(hdev, false, false);
+
+	return rc;
+}
+
+static long _hl_cs_wait_ioctl(struct hl_device *hdev,
+		struct hl_ctx *ctx, u64 timeout_us, u64 seq)
+{
+	struct dma_fence *fence;
+	unsigned long timeout;
+	long rc;
+
+	if (timeout_us == MAX_SCHEDULE_TIMEOUT)
+		timeout = timeout_us;
+	else
+		timeout = usecs_to_jiffies(timeout_us);
+
+	hl_ctx_get(hdev, ctx);
+
+	fence = hl_ctx_get_fence(ctx, seq);
+	if (IS_ERR(fence)) {
+		rc = PTR_ERR(fence);
+	} else if (fence) {
+		rc = dma_fence_wait_timeout(fence, true, timeout);
+		if (fence->error == -ETIMEDOUT)
+			rc = -ETIMEDOUT;
+		else if (fence->error == -EIO)
+			rc = -EIO;
+		dma_fence_put(fence);
+	} else
+		rc = 1;
+
+	hl_ctx_put(ctx);
+
+	return rc;
+}
+
+int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	union hl_wait_cs_args *args = data;
+	u64 seq = args->in.seq;
+	long rc;
+
+	rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq);
+
+	memset(args, 0, sizeof(*args));
+
+	if (rc < 0) {
+		dev_err(hdev->dev, "Error %ld on waiting for CS handle %llu\n",
+			rc, seq);
+		if (rc == -ERESTARTSYS) {
+			args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED;
+			rc = -EINTR;
+		} else if (rc == -ETIMEDOUT) {
+			args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
+		} else if (rc == -EIO) {
+			args->out.status = HL_WAIT_CS_STATUS_ABORTED;
+		}
+		return rc;
+	}
+
+	if (rc == 0)
+		args->out.status = HL_WAIT_CS_STATUS_BUSY;
+	else
+		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
+
+	return 0;
+}
diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c
index de1258e7a6e6..c3854714b46c 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/context.c
@@ -12,6 +12,18 @@
 static void hl_ctx_fini(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
+	int i;
+
+	/*
+	 * If we arrived here, there are no jobs waiting for this context
+	 * on its queues so we can safely remove it.
+	 * This is because for each CS, we increment the ref count and for
+	 * every CS that was finished we decrement it and we won't arrive
+	 * to this function unless the ref count is 0
+	 */
+
+	for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
+		dma_fence_put(ctx->cs_pending[i]);
 
 	if (ctx->asid != HL_KERNEL_ASID_ID)
 		hl_asid_free(hdev, ctx->asid);
@@ -23,8 +35,6 @@ void hl_ctx_do_release(struct kref *ref)
 
 	ctx = container_of(ref, struct hl_ctx, refcount);
 
-	dev_dbg(ctx->hdev->dev, "Now really releasing context %d\n", ctx->asid);
-
 	hl_ctx_fini(ctx);
 
 	if (ctx->hpriv)
@@ -90,6 +100,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 
 	kref_init(&ctx->refcount);
 
+	ctx->cs_sequence = 1;
+	spin_lock_init(&ctx->cs_lock);
+	atomic_set(&ctx->thread_restore_token, 1);
+	ctx->thread_restore_wait_token = 0;
+
 	if (is_kernel_ctx) {
 		ctx->asid = HL_KERNEL_ASID_ID; /* KMD gets ASID 0 */
 	} else {
@@ -100,8 +115,6 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 		}
 	}
 
-	dev_dbg(hdev->dev, "Created context with ASID %u\n", ctx->asid);
-
 	return 0;
 }
 
@@ -115,6 +128,37 @@ int hl_ctx_put(struct hl_ctx *ctx)
 	return kref_put(&ctx->refcount, hl_ctx_do_release);
 }
 
+struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct dma_fence *fence;
+
+	spin_lock(&ctx->cs_lock);
+
+	if (seq >= ctx->cs_sequence) {
+		dev_notice(hdev->dev,
+			"Can't wait on seq %llu because current CS is at seq %llu\n",
+			seq, ctx->cs_sequence);
+		spin_unlock(&ctx->cs_lock);
+		return ERR_PTR(-EINVAL);
+	}
+
+
+	if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
+		dev_dbg(hdev->dev,
+			"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
+			seq, ctx->cs_sequence);
+		spin_unlock(&ctx->cs_lock);
+		return NULL;
+	}
+
+	fence = dma_fence_get(
+			ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]);
+	spin_unlock(&ctx->cs_lock);
+
+	return fence;
+}
+
 /*
  * hl_ctx_mgr_init - initialize the context manager
  *
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 2aa8a68cdf76..cc5f068df597 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -30,6 +30,8 @@ static void hpriv_release(struct kref *ref)
 
 	put_pid(hpriv->taskpid);
 
+	mutex_destroy(&hpriv->restore_phase_mutex);
+
 	kfree(hpriv);
 
 	/* Now the FD is really closed */
@@ -208,6 +210,8 @@ static int device_early_init(struct hl_device *hdev)
 
 	mutex_init(&hdev->fd_open_cnt_lock);
 	mutex_init(&hdev->send_cpu_message_lock);
+	INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
+	spin_lock_init(&hdev->hw_queues_mirror_lock);
 	atomic_set(&hdev->in_reset, 0);
 	atomic_set(&hdev->fd_open_cnt, 0);
 
@@ -593,6 +597,9 @@ again:
 	 */
 	hdev->asic_funcs->halt_engines(hdev, hard_reset);
 
+	/* Go over all the queues, release all CS and their jobs */
+	hl_cs_rollback_all(hdev);
+
 	if (hard_reset) {
 		/* Release kernel context */
 		if (hl_ctx_put(hdev->kernel_ctx) != 1) {
@@ -616,6 +623,12 @@ again:
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
 		hl_cq_reset(hdev, &hdev->completion_queue[i]);
 
+	/* Make sure the setup phase for the user context will run again */
+	if (hdev->user_ctx) {
+		atomic_set(&hdev->user_ctx->thread_restore_token, 1);
+		hdev->user_ctx->thread_restore_wait_token = 0;
+	}
+
 	/* Finished tear-down, starting to re-initialize */
 
 	if (hard_reset) {
@@ -952,6 +965,9 @@ void hl_device_fini(struct hl_device *hdev)
 	 */
 	hdev->asic_funcs->halt_engines(hdev, true);
 
+	/* Go over all the queues, release all CS and their jobs */
+	hl_cs_rollback_all(hdev);
+
 	hl_cb_pool_fini(hdev);
 
 	/* Release kernel context */
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 1fe1d6a1ff9e..e3878fd7dc94 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -95,6 +95,19 @@ static const char goya_irq_name[GOYA_MSIX_ENTRIES][GOYA_MAX_STRING_LEN] = {
 		"goya cq 4", "goya cpu eq"
 };
 
+static u16 goya_packet_sizes[MAX_PACKET_ID] = {
+	[PACKET_WREG_32]	= sizeof(struct packet_wreg32),
+	[PACKET_WREG_BULK]	= sizeof(struct packet_wreg_bulk),
+	[PACKET_MSG_LONG]	= sizeof(struct packet_msg_long),
+	[PACKET_MSG_SHORT]	= sizeof(struct packet_msg_short),
+	[PACKET_CP_DMA]		= sizeof(struct packet_cp_dma),
+	[PACKET_MSG_PROT]	= sizeof(struct packet_msg_prot),
+	[PACKET_FENCE]		= sizeof(struct packet_fence),
+	[PACKET_LIN_DMA]	= sizeof(struct packet_lin_dma),
+	[PACKET_NOP]		= sizeof(struct packet_nop),
+	[PACKET_STOP]		= sizeof(struct packet_stop)
+};
+
 static const char *goya_axi_name[GOYA_MAX_INITIATORS] = {
 	"MME0",
 	"MME1",
@@ -2978,6 +2991,84 @@ void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
 	return base;
 }
 
+int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	struct packet_msg_prot *fence_pkt;
+	u32 *fence_ptr;
+	dma_addr_t fence_dma_addr;
+	struct hl_cb *cb;
+	u32 tmp;
+	int rc;
+
+	if (!hdev->asic_funcs->is_device_idle(hdev)) {
+		dev_err_ratelimited(hdev->dev,
+			"Can't send KMD job on QMAN0 if device is not idle\n");
+		return -EFAULT;
+	}
+
+	fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL,
+							&fence_dma_addr);
+	if (!fence_ptr) {
+		dev_err(hdev->dev,
+			"Failed to allocate fence memory for QMAN0\n");
+		return -ENOMEM;
+	}
+
+	*fence_ptr = 0;
+
+	if (goya->hw_cap_initialized & HW_CAP_MMU) {
+		WREG32(mmDMA_QM_0_GLBL_PROT, QMAN_DMA_FULLY_TRUSTED);
+		RREG32(mmDMA_QM_0_GLBL_PROT);
+	}
+
+	/*
+	 * goya cs parser saves space for 2xpacket_msg_prot at end of CB. For
+	 * synchronized kernel jobs we only need space for 1 packet_msg_prot
+	 */
+	job->job_cb_size -= sizeof(struct packet_msg_prot);
+
+	cb = job->patched_cb;
+
+	fence_pkt = (struct packet_msg_prot *) (uintptr_t) (cb->kernel_address +
+			job->job_cb_size - sizeof(struct packet_msg_prot));
+
+	fence_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) |
+			(1 << GOYA_PKT_CTL_EB_SHIFT) |
+			(1 << GOYA_PKT_CTL_MB_SHIFT);
+	fence_pkt->value = GOYA_QMAN0_FENCE_VAL;
+	fence_pkt->addr = fence_dma_addr +
+			hdev->asic_prop.host_phys_base_address;
+
+	rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_DMA_0,
+					job->job_cb_size, cb->bus_address);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to send CB on QMAN0, %d\n", rc);
+		goto free_fence_ptr;
+	}
+
+	rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) fence_ptr,
+					HL_DEVICE_TIMEOUT_USEC, &tmp);
+
+	hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_DMA_0);
+
+	if ((rc) || (tmp != GOYA_QMAN0_FENCE_VAL)) {
+		dev_err(hdev->dev, "QMAN0 Job hasn't finished in time\n");
+		rc = -ETIMEDOUT;
+	}
+
+free_fence_ptr:
+	hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr,
+					fence_dma_addr);
+
+	if (goya->hw_cap_initialized & HW_CAP_MMU) {
+		WREG32(mmDMA_QM_0_GLBL_PROT, QMAN_DMA_PARTLY_TRUSTED);
+		RREG32(mmDMA_QM_0_GLBL_PROT);
+	}
+
+	return rc;
+}
+
 int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
 				u32 timeout, long *result)
 {
@@ -3214,11 +3305,985 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 			size);
 }
 
+int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sg, int nents,
+			enum dma_data_direction dir)
+{
+	if (!dma_map_sg(&hdev->pdev->dev, sg, nents, dir))
+		return -ENOMEM;
+
+	return 0;
+}
+
+void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sg,
+			int nents, enum dma_data_direction dir)
+{
+	dma_unmap_sg(&hdev->pdev->dev, sg, nents, dir);
+}
+
+u32 goya_get_dma_desc_list_size(struct hl_device *hdev,
+					struct sg_table *sgt)
+{
+	struct scatterlist *sg, *sg_next_iter;
+	u32 count, len, dma_desc_cnt, len_next;
+	dma_addr_t addr, addr_next;
+
+	dma_desc_cnt = 0;
+
+	for_each_sg(sgt->sgl, sg, sgt->nents, count) {
+
+		len = sg_dma_len(sg);
+		addr = sg_dma_address(sg);
+
+		if (len == 0)
+			break;
+
+		while ((count + 1) < sgt->nents) {
+			sg_next_iter = sg_next(sg);
+			len_next = sg_dma_len(sg_next_iter);
+			addr_next = sg_dma_address(sg_next_iter);
+
+			if (len_next == 0)
+				break;
+
+			if ((addr + len == addr_next) &&
+				(len + len_next <= DMA_MAX_TRANSFER_SIZE)) {
+				len += len_next;
+				count++;
+				sg = sg_next_iter;
+			} else {
+				break;
+			}
+		}
+
+		dma_desc_cnt++;
+	}
+
+	return dma_desc_cnt * sizeof(struct packet_lin_dma);
+}
+
+static int goya_pin_memory_before_cs(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt,
+				u64 addr, enum dma_data_direction dir)
+{
+	struct hl_userptr *userptr;
+	int rc;
+
+	if (hl_userptr_is_pinned(hdev, addr, user_dma_pkt->tsize,
+			parser->job_userptr_list, &userptr))
+		goto already_pinned;
+
+	userptr = kzalloc(sizeof(*userptr), GFP_ATOMIC);
+	if (!userptr)
+		return -ENOMEM;
+
+	rc = hl_pin_host_memory(hdev, addr, user_dma_pkt->tsize, userptr);
+	if (rc)
+		goto free_userptr;
+
+	list_add_tail(&userptr->job_node, parser->job_userptr_list);
+
+	rc = hdev->asic_funcs->asic_dma_map_sg(hdev, userptr->sgt->sgl,
+					userptr->sgt->nents, dir);
+	if (rc) {
+		dev_err(hdev->dev, "failed to map sgt with DMA region\n");
+		goto unpin_memory;
+	}
+
+	userptr->dma_mapped = true;
+	userptr->dir = dir;
+
+already_pinned:
+	parser->patched_cb_size +=
+			goya_get_dma_desc_list_size(hdev, userptr->sgt);
+
+	return 0;
+
+unpin_memory:
+	hl_unpin_host_memory(hdev, userptr);
+free_userptr:
+	kfree(userptr);
+	return rc;
+}
+
+static int goya_validate_dma_pkt_host(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt)
+{
+	u64 device_memory_addr, addr;
+	enum dma_data_direction dir;
+	enum goya_dma_direction user_dir;
+	bool sram_addr = true;
+	bool skip_host_mem_pin = false;
+	bool user_memset;
+	int rc = 0;
+
+	user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+	user_memset = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT;
+
+	switch (user_dir) {
+	case DMA_HOST_TO_DRAM:
+		dev_dbg(hdev->dev, "DMA direction is HOST --> DRAM\n");
+		dir = DMA_TO_DEVICE;
+		sram_addr = false;
+		addr = user_dma_pkt->src_addr;
+		device_memory_addr = user_dma_pkt->dst_addr;
+		if (user_memset)
+			skip_host_mem_pin = true;
+		break;
+
+	case DMA_DRAM_TO_HOST:
+		dev_dbg(hdev->dev, "DMA direction is DRAM --> HOST\n");
+		dir = DMA_FROM_DEVICE;
+		sram_addr = false;
+		addr = user_dma_pkt->dst_addr;
+		device_memory_addr = user_dma_pkt->src_addr;
+		break;
+
+	case DMA_HOST_TO_SRAM:
+		dev_dbg(hdev->dev, "DMA direction is HOST --> SRAM\n");
+		dir = DMA_TO_DEVICE;
+		addr = user_dma_pkt->src_addr;
+		device_memory_addr = user_dma_pkt->dst_addr;
+		if (user_memset)
+			skip_host_mem_pin = true;
+		break;
+
+	case DMA_SRAM_TO_HOST:
+		dev_dbg(hdev->dev, "DMA direction is SRAM --> HOST\n");
+		dir = DMA_FROM_DEVICE;
+		addr = user_dma_pkt->dst_addr;
+		device_memory_addr = user_dma_pkt->src_addr;
+		break;
+	default:
+		dev_err(hdev->dev, "DMA direction is undefined\n");
+		return -EFAULT;
+	}
+
+	if (parser->ctx_id != HL_KERNEL_ASID_ID) {
+		if (sram_addr) {
+			if (!hl_mem_area_inside_range(device_memory_addr,
+					user_dma_pkt->tsize,
+					hdev->asic_prop.sram_user_base_address,
+					hdev->asic_prop.sram_end_address)) {
+
+				dev_err(hdev->dev,
+					"SRAM address 0x%llx + 0x%x is invalid\n",
+					device_memory_addr,
+					user_dma_pkt->tsize);
+				return -EFAULT;
+			}
+		} else {
+			if (!hl_mem_area_inside_range(device_memory_addr,
+					user_dma_pkt->tsize,
+					hdev->asic_prop.dram_user_base_address,
+					hdev->asic_prop.dram_end_address)) {
+
+				dev_err(hdev->dev,
+					"DRAM address 0x%llx + 0x%x is invalid\n",
+					device_memory_addr,
+					user_dma_pkt->tsize);
+				return -EFAULT;
+			}
+		}
+	}
+
+	if (skip_host_mem_pin)
+		parser->patched_cb_size += sizeof(*user_dma_pkt);
+	else {
+		if ((dir == DMA_TO_DEVICE) &&
+				(parser->hw_queue_id > GOYA_QUEUE_ID_DMA_1)) {
+			dev_err(hdev->dev,
+				"Can't DMA from host on queue other then 1\n");
+			return -EFAULT;
+		}
+
+		rc = goya_pin_memory_before_cs(hdev, parser, user_dma_pkt,
+						addr, dir);
+	}
+
+	return rc;
+}
+
+static int goya_validate_dma_pkt_no_host(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt)
+{
+	u64 sram_memory_addr, dram_memory_addr;
+	enum goya_dma_direction user_dir;
+
+	user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+	if (user_dir == DMA_DRAM_TO_SRAM) {
+		dev_dbg(hdev->dev, "DMA direction is DRAM --> SRAM\n");
+		dram_memory_addr = user_dma_pkt->src_addr;
+		sram_memory_addr = user_dma_pkt->dst_addr;
+	} else {
+		dev_dbg(hdev->dev, "DMA direction is SRAM --> DRAM\n");
+		sram_memory_addr = user_dma_pkt->src_addr;
+		dram_memory_addr = user_dma_pkt->dst_addr;
+	}
+
+	if (!hl_mem_area_inside_range(sram_memory_addr, user_dma_pkt->tsize,
+				hdev->asic_prop.sram_user_base_address,
+				hdev->asic_prop.sram_end_address)) {
+		dev_err(hdev->dev, "SRAM address 0x%llx + 0x%x is invalid\n",
+			sram_memory_addr, user_dma_pkt->tsize);
+		return -EFAULT;
+	}
+
+	if (!hl_mem_area_inside_range(dram_memory_addr, user_dma_pkt->tsize,
+				hdev->asic_prop.dram_user_base_address,
+				hdev->asic_prop.dram_end_address)) {
+		dev_err(hdev->dev, "DRAM address 0x%llx + 0x%x is invalid\n",
+			dram_memory_addr, user_dma_pkt->tsize);
+		return -EFAULT;
+	}
+
+	parser->patched_cb_size += sizeof(*user_dma_pkt);
+
+	return 0;
+}
+
+static int goya_validate_dma_pkt_no_mmu(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt)
+{
+	enum goya_dma_direction user_dir;
+	int rc;
+
+	dev_dbg(hdev->dev, "DMA packet details:\n");
+	dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr);
+	dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr);
+	dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize);
+
+	user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+	/*
+	 * Special handling for DMA with size 0. The H/W has a bug where
+	 * this can cause the QMAN DMA to get stuck, so block it here.
+	 */
+	if (user_dma_pkt->tsize == 0) {
+		dev_err(hdev->dev,
+			"Got DMA with size 0, might reset the device\n");
+		return -EINVAL;
+	}
+
+	if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM))
+		rc = goya_validate_dma_pkt_no_host(hdev, parser, user_dma_pkt);
+	else
+		rc = goya_validate_dma_pkt_host(hdev, parser, user_dma_pkt);
+
+	return rc;
+}
+
+static int goya_validate_dma_pkt_mmu(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt)
+{
+	dev_dbg(hdev->dev, "DMA packet details:\n");
+	dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr);
+	dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr);
+	dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize);
+
+	/*
+	 * WA for HW-23.
+	 * We can't allow user to read from Host using QMANs other than 1.
+	 */
+	if (parser->hw_queue_id > GOYA_QUEUE_ID_DMA_1 &&
+		hl_mem_area_inside_range(user_dma_pkt->src_addr,
+				user_dma_pkt->tsize,
+				hdev->asic_prop.va_space_host_start_address,
+				hdev->asic_prop.va_space_host_end_address)) {
+		dev_err(hdev->dev,
+			"Can't DMA from host on queue other then 1\n");
+		return -EFAULT;
+	}
+
+	if (user_dma_pkt->tsize == 0) {
+		dev_err(hdev->dev,
+			"Got DMA with size 0, might reset the device\n");
+		return -EINVAL;
+	}
+
+	parser->patched_cb_size += sizeof(*user_dma_pkt);
+
+	return 0;
+}
+
+static int goya_validate_wreg32(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_wreg32 *wreg_pkt)
+{
+	struct goya_device *goya = hdev->asic_specific;
+	u32 sob_start_addr, sob_end_addr;
+	u16 reg_offset;
+
+	reg_offset = wreg_pkt->ctl & GOYA_PKT_WREG32_CTL_REG_OFFSET_MASK;
+
+	dev_dbg(hdev->dev, "WREG32 packet details:\n");
+	dev_dbg(hdev->dev, "reg_offset == 0x%x\n", reg_offset);
+	dev_dbg(hdev->dev, "value      == 0x%x\n", wreg_pkt->value);
+
+	if (reg_offset != (mmDMA_CH_1_WR_COMP_ADDR_LO & 0xFFFF)) {
+		dev_err(hdev->dev, "WREG32 packet with illegal address 0x%x\n",
+			reg_offset);
+		return -EPERM;
+	}
+
+	/*
+	 * With MMU, DMA channels are not secured, so it doesn't matter where
+	 * the WR COMP will be written to because it will go out with
+	 * non-secured property
+	 */
+	if (goya->hw_cap_initialized & HW_CAP_MMU)
+		return 0;
+
+	sob_start_addr = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+	sob_end_addr = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_1023);
+
+	if ((wreg_pkt->value < sob_start_addr) ||
+			(wreg_pkt->value > sob_end_addr)) {
+
+		dev_err(hdev->dev, "WREG32 packet with illegal value 0x%x\n",
+			wreg_pkt->value);
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+static int goya_validate_cb(struct hl_device *hdev,
+			struct hl_cs_parser *parser, bool is_mmu)
+{
+	u32 cb_parsed_length = 0;
+	int rc = 0;
+
+	parser->patched_cb_size = 0;
+
+	/* cb_user_size is more than 0 so loop will always be executed */
+	while (cb_parsed_length < parser->user_cb_size) {
+		enum packet_id pkt_id;
+		u16 pkt_size;
+		void *user_pkt;
+
+		user_pkt = (void *) (uintptr_t)
+			(parser->user_cb->kernel_address + cb_parsed_length);
+
+		pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
+				PACKET_HEADER_PACKET_ID_MASK) >>
+					PACKET_HEADER_PACKET_ID_SHIFT);
+
+		pkt_size = goya_packet_sizes[pkt_id];
+		cb_parsed_length += pkt_size;
+		if (cb_parsed_length > parser->user_cb_size) {
+			dev_err(hdev->dev,
+				"packet 0x%x is out of CB boundary\n", pkt_id);
+			rc = -EINVAL;
+			break;
+		}
+
+		switch (pkt_id) {
+		case PACKET_WREG_32:
+			/*
+			 * Although it is validated after copy in patch_cb(),
+			 * need to validate here as well because patch_cb() is
+			 * not called in MMU path while this function is called
+			 */
+			rc = goya_validate_wreg32(hdev, parser, user_pkt);
+			break;
+
+		case PACKET_WREG_BULK:
+			dev_err(hdev->dev,
+				"User not allowed to use WREG_BULK\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_MSG_PROT:
+			dev_err(hdev->dev,
+				"User not allowed to use MSG_PROT\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_CP_DMA:
+			dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_STOP:
+			dev_err(hdev->dev, "User not allowed to use STOP\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_LIN_DMA:
+			if (is_mmu)
+				rc = goya_validate_dma_pkt_mmu(hdev, parser,
+						user_pkt);
+			else
+				rc = goya_validate_dma_pkt_no_mmu(hdev, parser,
+						user_pkt);
+			break;
+
+		case PACKET_MSG_LONG:
+		case PACKET_MSG_SHORT:
+		case PACKET_FENCE:
+		case PACKET_NOP:
+			parser->patched_cb_size += pkt_size;
+			break;
+
+		default:
+			dev_err(hdev->dev, "Invalid packet header 0x%x\n",
+				pkt_id);
+			rc = -EINVAL;
+			break;
+		}
+
+		if (rc)
+			break;
+	}
+
+	/*
+	 * The new CB should have space at the end for two MSG_PROT packets:
+	 * 1. A packet that will act as a completion packet
+	 * 2. A packet that will generate MSI-X interrupt
+	 */
+	parser->patched_cb_size += sizeof(struct packet_msg_prot) * 2;
+
+	return rc;
+}
+
+static int goya_patch_dma_packet(struct hl_device *hdev,
+				struct hl_cs_parser *parser,
+				struct packet_lin_dma *user_dma_pkt,
+				struct packet_lin_dma *new_dma_pkt,
+				u32 *new_dma_pkt_size)
+{
+	struct hl_userptr *userptr;
+	struct scatterlist *sg, *sg_next_iter;
+	u32 count, len, dma_desc_cnt, len_next;
+	dma_addr_t dma_addr, dma_addr_next;
+	enum goya_dma_direction user_dir;
+	u64 device_memory_addr, addr;
+	enum dma_data_direction dir;
+	struct sg_table *sgt;
+	bool skip_host_mem_pin = false;
+	bool user_memset;
+	u32 user_rdcomp_mask, user_wrcomp_mask;
+
+	user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+	user_memset = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >>
+			GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT;
+
+	if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM) ||
+			(user_dma_pkt->tsize == 0)) {
+		memcpy(new_dma_pkt, user_dma_pkt, sizeof(*new_dma_pkt));
+		*new_dma_pkt_size = sizeof(*new_dma_pkt);
+		return 0;
+	}
+
+	if ((user_dir == DMA_HOST_TO_DRAM) || (user_dir == DMA_HOST_TO_SRAM)) {
+		addr = user_dma_pkt->src_addr;
+		device_memory_addr = user_dma_pkt->dst_addr;
+		dir = DMA_TO_DEVICE;
+		if (user_memset)
+			skip_host_mem_pin = true;
+	} else {
+		addr = user_dma_pkt->dst_addr;
+		device_memory_addr = user_dma_pkt->src_addr;
+		dir = DMA_FROM_DEVICE;
+	}
+
+	if ((!skip_host_mem_pin) &&
+		(hl_userptr_is_pinned(hdev, addr, user_dma_pkt->tsize,
+			parser->job_userptr_list, &userptr) == false)) {
+		dev_err(hdev->dev, "Userptr 0x%llx + 0x%x NOT mapped\n",
+				addr, user_dma_pkt->tsize);
+		return -EFAULT;
+	}
+
+	if ((user_memset) && (dir == DMA_TO_DEVICE)) {
+		memcpy(new_dma_pkt, user_dma_pkt, sizeof(*user_dma_pkt));
+		*new_dma_pkt_size = sizeof(*user_dma_pkt);
+		return 0;
+	}
+
+	user_rdcomp_mask =
+			(user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK);
+
+	user_wrcomp_mask =
+			(user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK);
+
+	sgt = userptr->sgt;
+	dma_desc_cnt = 0;
+
+	for_each_sg(sgt->sgl, sg, sgt->nents, count) {
+		len = sg_dma_len(sg);
+		dma_addr = sg_dma_address(sg);
+
+		if (len == 0)
+			break;
+
+		while ((count + 1) < sgt->nents) {
+			sg_next_iter = sg_next(sg);
+			len_next = sg_dma_len(sg_next_iter);
+			dma_addr_next = sg_dma_address(sg_next_iter);
+
+			if (len_next == 0)
+				break;
+
+			if ((dma_addr + len == dma_addr_next) &&
+				(len + len_next <= DMA_MAX_TRANSFER_SIZE)) {
+				len += len_next;
+				count++;
+				sg = sg_next_iter;
+			} else {
+				break;
+			}
+		}
+
+		new_dma_pkt->ctl = user_dma_pkt->ctl;
+		if (likely(dma_desc_cnt))
+			new_dma_pkt->ctl &= ~GOYA_PKT_CTL_EB_MASK;
+		new_dma_pkt->ctl &= ~(GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK |
+					GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK);
+		new_dma_pkt->tsize = len;
+
+		dma_addr += hdev->asic_prop.host_phys_base_address;
+
+		if (dir == DMA_TO_DEVICE) {
+			new_dma_pkt->src_addr = dma_addr;
+			new_dma_pkt->dst_addr = device_memory_addr;
+		} else {
+			new_dma_pkt->src_addr = device_memory_addr;
+			new_dma_pkt->dst_addr = dma_addr;
+		}
+
+		if (!user_memset)
+			device_memory_addr += len;
+		dma_desc_cnt++;
+		new_dma_pkt++;
+	}
+
+	if (!dma_desc_cnt) {
+		dev_err(hdev->dev,
+			"Error of 0 SG entries when patching DMA packet\n");
+		return -EFAULT;
+	}
+
+	/* Fix the last dma packet - rdcomp/wrcomp must be as user set them */
+	new_dma_pkt--;
+	new_dma_pkt->ctl |= (user_rdcomp_mask | user_wrcomp_mask);
+
+	*new_dma_pkt_size = dma_desc_cnt * sizeof(struct packet_lin_dma);
+
+	return 0;
+}
+
+static int goya_patch_cb(struct hl_device *hdev,
+				struct hl_cs_parser *parser)
+{
+	u32 cb_parsed_length = 0;
+	u32 cb_patched_cur_length = 0;
+	int rc = 0;
+
+	/* cb_user_size is more than 0 so loop will always be executed */
+	while (cb_parsed_length < parser->user_cb_size) {
+		enum packet_id pkt_id;
+		u16 pkt_size;
+		u32 new_pkt_size = 0;
+		void *user_pkt, *kernel_pkt;
+
+		user_pkt = (void *) (uintptr_t)
+			(parser->user_cb->kernel_address + cb_parsed_length);
+		kernel_pkt = (void *) (uintptr_t)
+			(parser->patched_cb->kernel_address +
+					cb_patched_cur_length);
+
+		pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
+				PACKET_HEADER_PACKET_ID_MASK) >>
+					PACKET_HEADER_PACKET_ID_SHIFT);
+
+		pkt_size = goya_packet_sizes[pkt_id];
+		cb_parsed_length += pkt_size;
+		if (cb_parsed_length > parser->user_cb_size) {
+			dev_err(hdev->dev,
+				"packet 0x%x is out of CB boundary\n", pkt_id);
+			rc = -EINVAL;
+			break;
+		}
+
+		switch (pkt_id) {
+		case PACKET_LIN_DMA:
+			rc = goya_patch_dma_packet(hdev, parser, user_pkt,
+						kernel_pkt, &new_pkt_size);
+			cb_patched_cur_length += new_pkt_size;
+			break;
+
+		case PACKET_WREG_32:
+			memcpy(kernel_pkt, user_pkt, pkt_size);
+			cb_patched_cur_length += pkt_size;
+			rc = goya_validate_wreg32(hdev, parser, kernel_pkt);
+			break;
+
+		case PACKET_WREG_BULK:
+			dev_err(hdev->dev,
+				"User not allowed to use WREG_BULK\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_MSG_PROT:
+			dev_err(hdev->dev,
+				"User not allowed to use MSG_PROT\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_CP_DMA:
+			dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_STOP:
+			dev_err(hdev->dev, "User not allowed to use STOP\n");
+			rc = -EPERM;
+			break;
+
+		case PACKET_MSG_LONG:
+		case PACKET_MSG_SHORT:
+		case PACKET_FENCE:
+		case PACKET_NOP:
+			memcpy(kernel_pkt, user_pkt, pkt_size);
+			cb_patched_cur_length += pkt_size;
+			break;
+
+		default:
+			dev_err(hdev->dev, "Invalid packet header 0x%x\n",
+				pkt_id);
+			rc = -EINVAL;
+			break;
+		}
+
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
+static int goya_parse_cb_mmu(struct hl_device *hdev,
+		struct hl_cs_parser *parser)
+{
+	u64 patched_cb_handle;
+	u32 patched_cb_size;
+	struct hl_cb *user_cb;
+	int rc;
+
+	/*
+	 * The new CB should have space at the end for two MSG_PROT pkt:
+	 * 1. A packet that will act as a completion packet
+	 * 2. A packet that will generate MSI-X interrupt
+	 */
+	parser->patched_cb_size = parser->user_cb_size +
+			sizeof(struct packet_msg_prot) * 2;
+
+	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
+				parser->patched_cb_size,
+				&patched_cb_handle, HL_KERNEL_ASID_ID);
+
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to allocate patched CB for DMA CS %d\n",
+			rc);
+		return rc;
+	}
+
+	patched_cb_handle >>= PAGE_SHIFT;
+	parser->patched_cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr,
+				(u32) patched_cb_handle);
+	/* hl_cb_get should never fail here so use kernel WARN */
+	WARN(!parser->patched_cb, "DMA CB handle invalid 0x%x\n",
+			(u32) patched_cb_handle);
+	if (!parser->patched_cb) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	/*
+	 * The check that parser->user_cb_size <= parser->user_cb->size was done
+	 * in validate_queue_index().
+	 */
+	memcpy((void *) (uintptr_t) parser->patched_cb->kernel_address,
+		(void *) (uintptr_t) parser->user_cb->kernel_address,
+		parser->user_cb_size);
+
+	patched_cb_size = parser->patched_cb_size;
+
+	/* validate patched CB instead of user CB */
+	user_cb = parser->user_cb;
+	parser->user_cb = parser->patched_cb;
+	rc = goya_validate_cb(hdev, parser, true);
+	parser->user_cb = user_cb;
+
+	if (rc) {
+		hl_cb_put(parser->patched_cb);
+		goto out;
+	}
+
+	if (patched_cb_size != parser->patched_cb_size) {
+		dev_err(hdev->dev, "user CB size mismatch\n");
+		hl_cb_put(parser->patched_cb);
+		rc = -EINVAL;
+		goto out;
+	}
+
+out:
+	/*
+	 * Always call cb destroy here because we still have 1 reference
+	 * to it by calling cb_get earlier. After the job will be completed,
+	 * cb_put will release it, but here we want to remove it from the
+	 * idr
+	 */
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr,
+					patched_cb_handle << PAGE_SHIFT);
+
+	return rc;
+}
+
+int goya_parse_cb_no_mmu(struct hl_device *hdev, struct hl_cs_parser *parser)
+{
+	u64 patched_cb_handle;
+	int rc;
+
+	rc = goya_validate_cb(hdev, parser, false);
+
+	if (rc)
+		goto free_userptr;
+
+	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
+				parser->patched_cb_size,
+				&patched_cb_handle, HL_KERNEL_ASID_ID);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to allocate patched CB for DMA CS %d\n", rc);
+		goto free_userptr;
+	}
+
+	patched_cb_handle >>= PAGE_SHIFT;
+	parser->patched_cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr,
+				(u32) patched_cb_handle);
+	/* hl_cb_get should never fail here so use kernel WARN */
+	WARN(!parser->patched_cb, "DMA CB handle invalid 0x%x\n",
+			(u32) patched_cb_handle);
+	if (!parser->patched_cb) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	rc = goya_patch_cb(hdev, parser);
+
+	if (rc)
+		hl_cb_put(parser->patched_cb);
+
+out:
+	/*
+	 * Always call cb destroy here because we still have 1 reference
+	 * to it by calling cb_get earlier. After the job will be completed,
+	 * cb_put will release it, but here we want to remove it from the
+	 * idr
+	 */
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr,
+				patched_cb_handle << PAGE_SHIFT);
+
+free_userptr:
+	if (rc)
+		hl_userptr_delete_list(hdev, parser->job_userptr_list);
+	return rc;
+}
+
+int goya_parse_cb_no_ext_quque(struct hl_device *hdev,
+		struct hl_cs_parser *parser)
+{
+	struct asic_fixed_properties *asic_prop = &hdev->asic_prop;
+	struct goya_device *goya = hdev->asic_specific;
+
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU)) {
+		/* For internal queue jobs, just check if cb address is valid */
+		if (hl_mem_area_inside_range(
+				(u64) (uintptr_t) parser->user_cb,
+				parser->user_cb_size,
+				asic_prop->sram_user_base_address,
+				asic_prop->sram_end_address))
+			return 0;
+
+		if (hl_mem_area_inside_range(
+				(u64) (uintptr_t) parser->user_cb,
+				parser->user_cb_size,
+				asic_prop->dram_user_base_address,
+				asic_prop->dram_end_address))
+			return 0;
+
+		dev_err(hdev->dev,
+			"Internal CB address 0x%llx + 0x%x is not in SRAM nor in DRAM\n",
+			(u64) (uintptr_t) parser->user_cb,
+			parser->user_cb_size);
+
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
+{
+	struct goya_device *goya = hdev->asic_specific;
+
+	if (!parser->ext_queue)
+		return goya_parse_cb_no_ext_quque(hdev, parser);
+
+	if ((goya->hw_cap_initialized & HW_CAP_MMU) && parser->use_virt_addr)
+		return goya_parse_cb_mmu(hdev, parser);
+	else
+		return goya_parse_cb_no_mmu(hdev, parser);
+}
+
+void goya_add_end_of_cb_packets(u64 kernel_address, u32 len, u64 cq_addr,
+				u32 cq_val, u32 msix_vec)
+{
+	struct packet_msg_prot *cq_pkt;
+
+	cq_pkt = (struct packet_msg_prot *) (uintptr_t)
+		(kernel_address + len - (sizeof(struct packet_msg_prot) * 2));
+
+	cq_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) |
+			(1 << GOYA_PKT_CTL_EB_SHIFT) |
+			(1 << GOYA_PKT_CTL_MB_SHIFT);
+	cq_pkt->value = cq_val;
+	cq_pkt->addr = cq_addr;
+
+	cq_pkt++;
+
+	cq_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) |
+			(1 << GOYA_PKT_CTL_MB_SHIFT);
+	cq_pkt->value = msix_vec & 0x7FF;
+	cq_pkt->addr = CFG_BASE + mmPCIE_DBI_MSIX_DOORBELL_OFF;
+}
+
 static void goya_update_eq_ci(struct hl_device *hdev, u32 val)
 {
 	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, val);
 }
 
+int goya_context_switch(struct hl_device *hdev, u32 asid)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct packet_lin_dma *clear_sram_pkt;
+	struct hl_cs_parser parser;
+	struct hl_cs_job *job;
+	u32 cb_size;
+	struct hl_cb *cb;
+	int rc;
+
+	cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+	if (!cb)
+		return -EFAULT;
+
+	clear_sram_pkt = (struct packet_lin_dma *)
+					(uintptr_t) cb->kernel_address;
+
+	memset(clear_sram_pkt, 0, sizeof(*clear_sram_pkt));
+	cb_size = sizeof(*clear_sram_pkt);
+
+	clear_sram_pkt->ctl = ((PACKET_LIN_DMA << GOYA_PKT_CTL_OPCODE_SHIFT) |
+		(DMA_HOST_TO_SRAM << GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT) |
+		(1 << GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT) |
+		(1 << GOYA_PKT_LIN_DMA_CTL_WO_SHIFT) |
+		(1 << GOYA_PKT_CTL_RB_SHIFT) |
+		(1 << GOYA_PKT_CTL_MB_SHIFT));
+
+	clear_sram_pkt->src_addr = 0x7777777777777777ull;
+	clear_sram_pkt->dst_addr = prop->sram_base_address;
+	if (hdev->pldm)
+		clear_sram_pkt->tsize = 0x10000;
+	else
+		clear_sram_pkt->tsize = prop->sram_size;
+
+	job = hl_cs_allocate_job(hdev, true);
+	if (!job) {
+		dev_err(hdev->dev, "Failed to allocate a new job\n");
+		rc = -ENOMEM;
+		goto release_cb;
+	}
+
+	job->id = 0;
+	job->user_cb = cb;
+	job->user_cb->cs_cnt++;
+	job->user_cb_size = cb_size;
+	job->hw_queue_id = GOYA_QUEUE_ID_DMA_0;
+
+	parser.ctx_id = HL_KERNEL_ASID_ID;
+	parser.cs_sequence = 0;
+	parser.job_id = job->id;
+	parser.hw_queue_id = job->hw_queue_id;
+	parser.job_userptr_list = &job->userptr_list;
+	parser.user_cb = job->user_cb;
+	parser.user_cb_size = job->user_cb_size;
+	parser.ext_queue = job->ext_queue;
+	parser.use_virt_addr = hdev->mmu_enable;
+
+	rc = hdev->asic_funcs->cs_parser(hdev, &parser);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to parse kernel CB during context switch\n");
+		goto free_job;
+	}
+
+	job->patched_cb = parser.patched_cb;
+	job->job_cb_size = parser.patched_cb_size;
+	job->patched_cb->cs_cnt++;
+
+	rc = goya_send_job_on_qman0(hdev, job);
+
+	job->patched_cb->cs_cnt--;
+	hl_cb_put(job->patched_cb);
+
+free_job:
+	hl_userptr_delete_list(hdev, &job->userptr_list);
+	kfree(job);
+	cb->cs_cnt--;
+
+release_cb:
+	hl_cb_put(cb);
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
+
+	return rc;
+}
+
+void goya_restore_phase_topology(struct hl_device *hdev)
+{
+	int i, num_of_sob_in_longs, num_of_mon_in_longs;
+
+	num_of_sob_in_longs =
+		((mmSYNC_MNGR_SOB_OBJ_1023 - mmSYNC_MNGR_SOB_OBJ_0) + 4);
+
+	num_of_mon_in_longs =
+		((mmSYNC_MNGR_MON_STATUS_255 - mmSYNC_MNGR_MON_STATUS_0) + 4);
+
+	for (i = 0 ; i < num_of_sob_in_longs ; i += 4)
+		WREG32(mmSYNC_MNGR_SOB_OBJ_0 + i, 0);
+
+	for (i = 0 ; i < num_of_mon_in_longs ; i += 4)
+		WREG32(mmSYNC_MNGR_MON_STATUS_0 + i, 0);
+
+	/* Flush all WREG to prevent race */
+	i = RREG32(mmSYNC_MNGR_SOB_OBJ_0);
+}
+
 static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id,
 		u16 event_type, char *axi_name, int len)
 {
@@ -3608,6 +4673,59 @@ static void goya_disable_clock_gating(struct hl_device *hdev)
 
 }
 
+static bool goya_is_device_idle(struct hl_device *hdev)
+{
+	u64 offset, dma_qm_reg, tpc_qm_reg, tpc_cmdq_reg, tpc_cfg_reg;
+	int i;
+
+	offset = mmDMA_QM_1_GLBL_STS0 - mmDMA_QM_0_GLBL_STS0;
+
+	for (i = 0 ; i < DMA_MAX_NUM ; i++) {
+		dma_qm_reg = mmDMA_QM_0_GLBL_STS0 + i * offset;
+
+		if ((RREG32(dma_qm_reg) & DMA_QM_IDLE_MASK) !=
+				DMA_QM_IDLE_MASK)
+			return false;
+	}
+
+	offset = mmTPC1_QM_GLBL_STS0 - mmTPC0_QM_GLBL_STS0;
+
+	for (i = 0 ; i < TPC_MAX_NUM ; i++) {
+		tpc_qm_reg = mmTPC0_QM_GLBL_STS0 + i * offset;
+		tpc_cmdq_reg = mmTPC0_CMDQ_GLBL_STS0 + i * offset;
+		tpc_cfg_reg = mmTPC0_CFG_STATUS + i * offset;
+
+		if ((RREG32(tpc_qm_reg) & TPC_QM_IDLE_MASK) !=
+				TPC_QM_IDLE_MASK)
+			return false;
+
+		if ((RREG32(tpc_cmdq_reg) & TPC_CMDQ_IDLE_MASK) !=
+				TPC_CMDQ_IDLE_MASK)
+			return false;
+
+		if ((RREG32(tpc_cfg_reg) & TPC_CFG_IDLE_MASK) !=
+				TPC_CFG_IDLE_MASK)
+			return false;
+	}
+
+	if ((RREG32(mmMME_QM_GLBL_STS0) & MME_QM_IDLE_MASK) !=
+			MME_QM_IDLE_MASK)
+		return false;
+
+	if ((RREG32(mmMME_CMDQ_GLBL_STS0) & MME_CMDQ_IDLE_MASK) !=
+			MME_CMDQ_IDLE_MASK)
+		return false;
+
+	if ((RREG32(mmMME_ARCH_STATUS) & MME_ARCH_IDLE_MASK) !=
+			MME_ARCH_IDLE_MASK)
+		return false;
+
+	if (RREG32(mmMME_SHADOW_0_STATUS) & MME_SHADOW_IDLE_MASK)
+		return false;
+
+	return true;
+}
+
 static void goya_hw_queues_lock(struct hl_device *hdev)
 {
 	struct goya_device *goya = hdev->asic_specific;
@@ -3700,7 +4818,14 @@ static const struct hl_asic_funcs goya_funcs = {
 	.dma_pool_free = goya_dma_pool_free,
 	.cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc,
 	.cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free,
+	.hl_dma_unmap_sg = goya_dma_unmap_sg,
+	.cs_parser = goya_cs_parser,
+	.asic_dma_map_sg = goya_dma_map_sg,
+	.get_dma_desc_list_size = goya_get_dma_desc_list_size,
+	.add_end_of_cb_packets = goya_add_end_of_cb_packets,
 	.update_eq_ci = goya_update_eq_ci,
+	.context_switch = goya_context_switch,
+	.restore_phase_topology = goya_restore_phase_topology,
 	.add_device_attr = goya_add_device_attr,
 	.handle_eqe = goya_handle_eqe,
 	.set_pll_profile = goya_set_pll_profile,
@@ -3708,6 +4833,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.send_heartbeat = goya_send_heartbeat,
 	.enable_clock_gating = goya_init_clock_gating,
 	.disable_clock_gating = goya_disable_clock_gating,
+	.is_device_idle = goya_is_device_idle,
 	.soft_reset_late_init = goya_soft_reset_late_init,
 	.hw_queues_lock = goya_hw_queues_lock,
 	.hw_queues_unlock = goya_hw_queues_unlock,
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 744e37bbc2a6..9adc7c6ec08b 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -16,6 +16,9 @@
 #include <linux/cdev.h>
 #include <linux/iopoll.h>
 #include <linux/irqreturn.h>
+#include <linux/dma-fence.h>
+#include <linux/dma-direction.h>
+#include <linux/scatterlist.h>
 
 #define HL_NAME				"habanalabs"
 
@@ -31,6 +34,11 @@
 
 #define HL_MAX_QUEUES			128
 
+#define HL_MAX_JOBS_PER_CS		64
+
+/* MUST BE POWER OF 2 and larger than 1 */
+#define HL_MAX_PENDING_CS		64
+
 struct hl_device;
 struct hl_fpriv;
 
@@ -62,6 +70,16 @@ struct hw_queue_properties {
 };
 
 /**
+ * enum vm_type_t - virtual memory mapping request information.
+ * @VM_TYPE_USERPTR: mapping of user memory to device virtual address.
+ * @VM_TYPE_PHYS_LIST: mapping of DRAM memory to device virtual address.
+ */
+enum vm_type_t {
+	VM_TYPE_USERPTR,
+	VM_TYPE_PHYS_LIST
+};
+
+/**
  * enum hl_device_hw_state - H/W device state. use this to understand whether
  *                           to do reset before hw_init or not
  * @HL_DEVICE_HW_STATE_CLEAN: H/W state is clean. i.e. after hard reset
@@ -147,6 +165,19 @@ struct asic_fixed_properties {
 	u8			tpc_enabled_mask;
 };
 
+/**
+ * struct hl_dma_fence - wrapper for fence object used by command submissions.
+ * @base_fence: kernel fence object.
+ * @lock: spinlock to protect fence.
+ * @hdev: habanalabs device structure.
+ * @cs_seq: command submission sequence number.
+ */
+struct hl_dma_fence {
+	struct dma_fence	base_fence;
+	spinlock_t		lock;
+	struct hl_device	*hdev;
+	u64			cs_seq;
+};
 
 /*
  * Command Buffers
@@ -175,6 +206,7 @@ struct hl_cb_mgr {
  * @mmap_size: Holds the CB's size that was mmaped.
  * @size: holds the CB's size.
  * @id: the CB's ID.
+ * @cs_cnt: holds number of CS that this CB participates in.
  * @ctx_id: holds the ID of the owner's context.
  * @mmap: true if the CB is currently mmaped to user.
  * @is_pool: true if CB was acquired from the pool, false otherwise.
@@ -189,6 +221,7 @@ struct hl_cb {
 	u32			mmap_size;
 	u32			size;
 	u32			id;
+	u32			cs_cnt;
 	u32			ctx_id;
 	u8			mmap;
 	u8			is_pool;
@@ -313,6 +346,8 @@ enum hl_asic_type {
 	ASIC_INVALID
 };
 
+struct hl_cs_parser;
+
 /**
  * enum hl_pm_mng_profile - power management profile.
  * @PM_AUTO: internal clock is set by KMD.
@@ -372,7 +407,14 @@ enum hl_pll_frequency {
  * @dma_pool_free: free small DMA allocation from pool.
  * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool.
  * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool.
+ * @hl_dma_unmap_sg: DMA unmap scatter-gather list.
+ * @cs_parser: parse Command Submission.
+ * @asic_dma_map_sg: DMA map scatter-gather list.
+ * @get_dma_desc_list_size: get number of LIN_DMA packets required for CB.
+ * @add_end_of_cb_packets: Add packets to the end of CB, if device requires it.
  * @update_eq_ci: update event queue CI.
+ * @context_switch: called upon ASID context switch.
+ * @restore_phase_topology: clear all SOBs amd MONs.
  * @add_device_attr: add ASIC specific device attributes.
  * @handle_eqe: handle event queue entry (IRQ) from ArmCP.
  * @set_pll_profile: change PLL profile (manual/automatic).
@@ -380,6 +422,7 @@ enum hl_pll_frequency {
  * @send_heartbeat: send is-alive packet to ArmCP and verify response.
  * @enable_clock_gating: enable clock gating for reducing power consumption.
  * @disable_clock_gating: disable clock for accessing registers on HBW.
+ * @is_device_idle: return true if device is idle, false otherwise.
  * @soft_reset_late_init: perform certain actions needed after soft reset.
  * @hw_queues_lock: acquire H/W queues lock.
  * @hw_queues_unlock: release H/W queues lock.
@@ -419,7 +462,20 @@ struct hl_asic_funcs {
 				size_t size, dma_addr_t *dma_handle);
 	void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
 				size_t size, void *vaddr);
+	void (*hl_dma_unmap_sg)(struct hl_device *hdev,
+				struct scatterlist *sg, int nents,
+				enum dma_data_direction dir);
+	int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser);
+	int (*asic_dma_map_sg)(struct hl_device *hdev,
+				struct scatterlist *sg, int nents,
+				enum dma_data_direction dir);
+	u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
+					struct sg_table *sgt);
+	void (*add_end_of_cb_packets)(u64 kernel_address, u32 len, u64 cq_addr,
+					u32 cq_val, u32 msix_num);
 	void (*update_eq_ci)(struct hl_device *hdev, u32 val);
+	int (*context_switch)(struct hl_device *hdev, u32 asid);
+	void (*restore_phase_topology)(struct hl_device *hdev);
 	void (*add_device_attr)(struct hl_device *hdev,
 				struct attribute_group *dev_attr_grp);
 	void (*handle_eqe)(struct hl_device *hdev,
@@ -430,6 +486,7 @@ struct hl_asic_funcs {
 	int (*send_heartbeat)(struct hl_device *hdev);
 	void (*enable_clock_gating)(struct hl_device *hdev);
 	void (*disable_clock_gating)(struct hl_device *hdev);
+	bool (*is_device_idle)(struct hl_device *hdev);
 	int (*soft_reset_late_init)(struct hl_device *hdev);
 	void (*hw_queues_lock)(struct hl_device *hdev);
 	void (*hw_queues_unlock)(struct hl_device *hdev);
@@ -453,12 +510,28 @@ struct hl_asic_funcs {
  * @hdev: pointer to the device structure.
  * @refcount: reference counter for the context. Context is released only when
  *		this hits 0l. It is incremented on CS and CS_WAIT.
+ * @cs_pending: array of DMA fence objects representing pending CS.
+ * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
+ *			to user so user could inquire about CS. It is used as
+ *			index to cs_pending array.
+ * @cs_lock: spinlock to protect cs_sequence.
+ * @thread_restore_token: token to prevent multiple threads of the same context
+ *				from running the restore phase. Only one thread
+ *				should run it.
+ * @thread_restore_wait_token: token to prevent the threads that didn't run
+ *				the restore phase from moving to their execution
+ *				phase before the restore phase has finished.
  * @asid: context's unique address space ID in the device's MMU.
  */
 struct hl_ctx {
 	struct hl_fpriv		*hpriv;
 	struct hl_device	*hdev;
 	struct kref		refcount;
+	struct dma_fence	*cs_pending[HL_MAX_PENDING_CS];
+	u64			cs_sequence;
+	spinlock_t		cs_lock;
+	atomic_t		thread_restore_token;
+	u32			thread_restore_wait_token;
 	u32			asid;
 };
 
@@ -473,14 +546,129 @@ struct hl_ctx_mgr {
 };
 
 
+
+/*
+ * COMMAND SUBMISSIONS
+ */
+
+/**
+ * struct hl_userptr - memory mapping chunk information
+ * @vm_type: type of the VM.
+ * @job_node: linked-list node for hanging the object on the Job's list.
+ * @vec: pointer to the frame vector.
+ * @sgt: pointer to the scatter-gather table that holds the pages.
+ * @dir: for DMA unmapping, the direction must be supplied, so save it.
+ * @debugfs_list: node in debugfs list of command submissions.
+ * @addr: user-space virtual pointer to the start of the memory area.
+ * @size: size of the memory area to pin & map.
+ * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise.
+ */
+struct hl_userptr {
+	enum vm_type_t		vm_type; /* must be first */
+	struct list_head	job_node;
+	struct frame_vector	*vec;
+	struct sg_table		*sgt;
+	enum dma_data_direction dir;
+	struct list_head	debugfs_list;
+	u64			addr;
+	u32			size;
+	u8			dma_mapped;
+};
+
+/**
+ * struct hl_cs - command submission.
+ * @jobs_in_queue_cnt: per each queue, maintain counter of submitted jobs.
+ * @ctx: the context this CS belongs to.
+ * @job_list: list of the CS's jobs in the various queues.
+ * @job_lock: spinlock for the CS's jobs list. Needed for free_job.
+ * @refcount: reference counter for usage of the CS.
+ * @fence: pointer to the fence object of this CS.
+ * @work_tdr: delayed work node for TDR.
+ * @mirror_node : node in device mirror list of command submissions.
+ * @sequence: the sequence number of this CS.
+ * @submitted: true if CS was submitted to H/W.
+ * @completed: true if CS was completed by device.
+ * @timedout : true if CS was timedout.
+ * @tdr_active: true if TDR was activated for this CS (to prevent
+ *		double TDR activation).
+ * @aborted: true if CS was aborted due to some device error.
+ */
+struct hl_cs {
+	u8			jobs_in_queue_cnt[HL_MAX_QUEUES];
+	struct hl_ctx		*ctx;
+	struct list_head	job_list;
+	spinlock_t		job_lock;
+	struct kref		refcount;
+	struct dma_fence	*fence;
+	struct delayed_work	work_tdr;
+	struct list_head	mirror_node;
+	u64			sequence;
+	u8			submitted;
+	u8			completed;
+	u8			timedout;
+	u8			tdr_active;
+	u8			aborted;
+};
+
 /**
  * struct hl_cs_job - command submission job.
+ * @cs_node: the node to hang on the CS jobs list.
+ * @cs: the CS this job belongs to.
+ * @user_cb: the CB we got from the user.
+ * @patched_cb: in case of patching, this is internal CB which is submitted on
+ *		the queue instead of the CB we got from the IOCTL.
  * @finish_work: workqueue object to run when job is completed.
+ * @userptr_list: linked-list of userptr mappings that belong to this job and
+ *			wait for completion.
  * @id: the id of this job inside a CS.
+ * @hw_queue_id: the id of the H/W queue this job is submitted to.
+ * @user_cb_size: the actual size of the CB we got from the user.
+ * @job_cb_size: the actual size of the CB that we put on the queue.
+ * @ext_queue: whether the job is for external queue or internal queue.
  */
 struct hl_cs_job {
+	struct list_head	cs_node;
+	struct hl_cs		*cs;
+	struct hl_cb		*user_cb;
+	struct hl_cb		*patched_cb;
 	struct work_struct	finish_work;
+	struct list_head	userptr_list;
 	u32			id;
+	u32			hw_queue_id;
+	u32			user_cb_size;
+	u32			job_cb_size;
+	u8			ext_queue;
+};
+
+/**
+ * struct hl_cs_parser - command submission paerser properties.
+ * @user_cb: the CB we got from the user.
+ * @patched_cb: in case of patching, this is internal CB which is submitted on
+ *		the queue instead of the CB we got from the IOCTL.
+ * @job_userptr_list: linked-list of userptr mappings that belong to the related
+ *			job and wait for completion.
+ * @cs_sequence: the sequence number of the related CS.
+ * @ctx_id: the ID of the context the related CS belongs to.
+ * @hw_queue_id: the id of the H/W queue this job is submitted to.
+ * @user_cb_size: the actual size of the CB we got from the user.
+ * @patched_cb_size: the size of the CB after parsing.
+ * @ext_queue: whether the job is for external queue or internal queue.
+ * @job_id: the id of the related job inside the related CS.
+ * @use_virt_addr: whether to treat the addresses in the CB as virtual during
+ *			parsing.
+ */
+struct hl_cs_parser {
+	struct hl_cb		*user_cb;
+	struct hl_cb		*patched_cb;
+	struct list_head	*job_userptr_list;
+	u64			cs_sequence;
+	u32			ctx_id;
+	u32			hw_queue_id;
+	u32			user_cb_size;
+	u32			patched_cb_size;
+	u8			ext_queue;
+	u8			job_id;
+	u8			use_virt_addr;
 };
 
 
@@ -497,6 +685,7 @@ struct hl_cs_job {
  * @ctx_mgr: context manager to handle multiple context for this FD.
  * @cb_mgr: command buffer manager to handle multiple buffers for this FD.
  * @refcount: number of related contexts.
+ * @restore_phase_mutex: lock for context switch and restore phase.
  */
 struct hl_fpriv {
 	struct hl_device	*hdev;
@@ -506,6 +695,7 @@ struct hl_fpriv {
 	struct hl_ctx_mgr	ctx_mgr;
 	struct hl_cb_mgr	cb_mgr;
 	struct kref		refcount;
+	struct mutex		restore_phase_mutex;
 };
 
 
@@ -577,6 +767,8 @@ struct hl_device_reset_work {
  * @eq_wq: work queue of event queue for executing work in process context.
  * @kernel_ctx: KMD context structure.
  * @kernel_queues: array of hl_hw_queue.
+ * @hw_queues_mirror_list: CS mirror list for TDR.
+ * @hw_queues_mirror_lock: protects hw_queues_mirror_list.
  * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
  * @event_queue: event queue for IRQ from ArmCP.
  * @dma_pool: DMA pool for small allocations.
@@ -604,6 +796,7 @@ struct hl_device_reset_work {
  * @in_reset: is device in reset flow.
  * @curr_pll_profile: current PLL profile.
  * @fd_open_cnt: number of open user processes.
+ * @timeout_jiffies: device CS timeout value.
  * @max_power: the max power of the device, as configured by the sysadmin. This
  *             value is saved so in case of hard-reset, KMD will restore this
  *             value and update the F/W after the re-initialization
@@ -617,7 +810,10 @@ struct hl_device_reset_work {
  * @hwmon_initialized: is H/W monitor sensors was initialized.
  * @hard_reset_pending: is there a hard reset work pending.
  * @heartbeat: is heartbeat sanity check towards ArmCP enabled.
+ * @reset_on_lockup: true if a reset should be done in case of stuck CS, false
+ *                   otherwise.
  * @init_done: is the initialization of the device done.
+ * @mmu_enable: is MMU enabled.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -634,6 +830,8 @@ struct hl_device {
 	struct workqueue_struct		*eq_wq;
 	struct hl_ctx			*kernel_ctx;
 	struct hl_hw_queue		*kernel_queues;
+	struct list_head		hw_queues_mirror_list;
+	spinlock_t			hw_queues_mirror_lock;
 	struct hl_cb_mgr		kernel_cb_mgr;
 	struct hl_eq			event_queue;
 	struct dma_pool			*dma_pool;
@@ -661,6 +859,7 @@ struct hl_device {
 	atomic_t			in_reset;
 	atomic_t			curr_pll_profile;
 	atomic_t			fd_open_cnt;
+	u64				timeout_jiffies;
 	u64				max_power;
 	u32				major;
 	u32				high_pll;
@@ -672,9 +871,11 @@ struct hl_device {
 	u8				hwmon_initialized;
 	u8				hard_reset_pending;
 	u8				heartbeat;
+	u8				reset_on_lockup;
 	u8				init_done;
 
 	/* Parameters for bring-up */
+	u8				mmu_enable;
 	u8				cpu_enable;
 	u8				reset_pcilink;
 	u8				cpu_queues_enable;
@@ -712,6 +913,58 @@ struct hl_ioctl_desc {
  * Kernel module functions that can be accessed by entire module
  */
 
+/**
+ * hl_mem_area_inside_range() - Checks whether address+size are inside a range.
+ * @address: The start address of the area we want to validate.
+ * @size: The size in bytes of the area we want to validate.
+ * @range_start_address: The start address of the valid range.
+ * @range_end_address: The end address of the valid range.
+ *
+ * Return: true if the area is inside the valid range, false otherwise.
+ */
+static inline bool hl_mem_area_inside_range(u64 address, u32 size,
+				u64 range_start_address, u64 range_end_address)
+{
+	u64 end_address = address + size;
+
+	if ((address >= range_start_address) &&
+			(end_address <= range_end_address) &&
+			(end_address > address))
+		return true;
+
+	return false;
+}
+
+/**
+ * hl_mem_area_crosses_range() - Checks whether address+size crossing a range.
+ * @address: The start address of the area we want to validate.
+ * @size: The size in bytes of the area we want to validate.
+ * @range_start_address: The start address of the valid range.
+ * @range_end_address: The end address of the valid range.
+ *
+ * Return: true if the area overlaps part or all of the valid range,
+ *		false otherwise.
+ */
+static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
+				u64 range_start_address, u64 range_end_address)
+{
+	u64 end_address = address + size;
+
+	if ((address >= range_start_address) &&
+			(address < range_end_address))
+		return true;
+
+	if ((end_address >= range_start_address) &&
+			(end_address < range_end_address))
+		return true;
+
+	if ((address < range_start_address) &&
+			(end_address >= range_end_address))
+		return true;
+
+	return false;
+}
+
 int hl_device_open(struct inode *inode, struct file *filp);
 bool hl_device_disabled_or_in_reset(struct hl_device *hdev);
 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
@@ -725,8 +978,10 @@ int hl_hw_queues_create(struct hl_device *hdev);
 void hl_hw_queues_destroy(struct hl_device *hdev);
 int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
 				u32 cb_size, u64 cb_ptr);
+int hl_hw_queue_schedule_cs(struct hl_cs *cs);
 u32 hl_hw_queue_add_ptr(u32 ptr, u16 val);
 void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id);
+void hl_int_hw_queue_update_ci(struct hl_cs *cs);
 void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset);
 
 #define hl_queue_inc_ptr(p)		hl_hw_queue_add_ptr(p, 1)
@@ -740,6 +995,8 @@ void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q);
 void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q);
 irqreturn_t hl_irq_handler_cq(int irq, void *arg);
 irqreturn_t hl_irq_handler_eq(int irq, void *arg);
+u32 hl_cq_inc_ptr(u32 ptr);
+
 int hl_asid_init(struct hl_device *hdev);
 void hl_asid_fini(struct hl_device *hdev);
 unsigned long hl_asid_alloc(struct hl_device *hdev);
@@ -748,9 +1005,13 @@ void hl_asid_free(struct hl_device *hdev, unsigned long asid);
 int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv);
 void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx);
 int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
+void hl_ctx_do_release(struct kref *ref);
+void hl_ctx_get(struct hl_device *hdev,	struct hl_ctx *ctx);
 int hl_ctx_put(struct hl_ctx *ctx);
+struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
 void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
 void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);
+
 int hl_device_init(struct hl_device *hdev, struct class *hclass);
 void hl_device_fini(struct hl_device *hdev);
 int hl_device_suspend(struct hl_device *hdev);
@@ -782,8 +1043,20 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size);
 int hl_cb_pool_init(struct hl_device *hdev);
 int hl_cb_pool_fini(struct hl_device *hdev);
 
+void hl_cs_rollback_all(struct hl_device *hdev);
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue);
+
 void goya_set_asic_funcs(struct hl_device *hdev);
 
+int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
+			struct hl_userptr *userptr);
+int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr);
+void hl_userptr_delete_list(struct hl_device *hdev,
+				struct list_head *userptr_list);
+bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, u32 size,
+				struct list_head *userptr_list,
+				struct hl_userptr **userptr);
+
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
 void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq);
 long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr);
@@ -799,5 +1072,7 @@ void hl_set_max_power(struct hl_device *hdev, u64 value);
 /* IOCTLs */
 long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data);
+int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data);
+int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data);
 
 #endif /* HABANALABSP_H_ */
diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c
index b0bf77af1e40..77a1cc85e530 100644
--- a/drivers/misc/habanalabs/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/habanalabs_drv.c
@@ -24,6 +24,17 @@ static struct class *hl_class;
 DEFINE_IDR(hl_devs_idr);
 DEFINE_MUTEX(hl_devs_idr_lock);
 
+static int timeout_locked = 5;
+static int reset_on_lockup = 1;
+
+module_param(timeout_locked, int, 0444);
+MODULE_PARM_DESC(timeout_locked,
+	"Device lockup timeout in seconds (0 = disabled, default 5s)");
+
+module_param(reset_on_lockup, int, 0444);
+MODULE_PARM_DESC(reset_on_lockup,
+	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
+
 #define PCI_VENDOR_ID_HABANALABS	0x1da3
 
 #define PCI_IDS_GOYA			0x0001
@@ -113,6 +124,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 	hpriv->hdev = hdev;
 	filp->private_data = hpriv;
 	hpriv->filp = filp;
+	mutex_init(&hpriv->restore_phase_mutex);
 	kref_init(&hpriv->refcount);
 	nonseekable_open(inode, filp);
 
@@ -140,6 +152,7 @@ out_err:
 	filp->private_data = NULL;
 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
 	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
+	mutex_destroy(&hpriv->restore_phase_mutex);
 	kfree(hpriv);
 
 close_device:
@@ -172,8 +185,10 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 		return -ENOMEM;
 
 	hdev->major = hl_major;
+	hdev->reset_on_lockup = reset_on_lockup;
 
 	/* Parameters for bring-up - set them to defaults */
+	hdev->mmu_enable = 0;
 	hdev->cpu_enable = 1;
 	hdev->reset_pcilink = 0;
 	hdev->cpu_queues_enable = 1;
@@ -193,6 +208,11 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 	if (!hdev->cpu_queues_enable)
 		hdev->heartbeat = 0;
 
+	if (timeout_locked)
+		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
+	else
+		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
+
 	hdev->disabled = true;
 	hdev->pdev = pdev; /* can be NULL in case of simulator device */
 
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index e56a51f6bab6..481db1a5e97e 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -16,7 +16,9 @@
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func}
 
 static const struct hl_ioctl_desc hl_ioctls[] = {
-	HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl)
+	HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl),
+	HL_IOCTL_DEF(HL_IOCTL_CS, hl_cs_ioctl),
+	HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_cs_wait_ioctl)
 };
 
 #define HL_CORE_IOCTL_COUNT	ARRAY_SIZE(hl_ioctls)
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index 2ec43f36cdb8..68dfda59a875 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -34,6 +34,29 @@ static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len)
 		return (abs(delta) - queue_len);
 }
 
+void hl_int_hw_queue_update_ci(struct hl_cs *cs)
+{
+	struct hl_device *hdev = cs->ctx->hdev;
+	struct hl_hw_queue *q;
+	int i;
+
+	hdev->asic_funcs->hw_queues_lock(hdev);
+
+	if (hdev->disabled)
+		goto out;
+
+	q = &hdev->kernel_queues[0];
+	for (i = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
+		if (q->queue_type == QUEUE_TYPE_INT) {
+			q->ci += cs->jobs_in_queue_cnt[i];
+			q->ci &= ((q->int_queue_len << 1) - 1);
+		}
+	}
+
+out:
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+}
+
 /*
  * ext_queue_submit_bd - Submit a buffer descriptor to an external queue
  *
@@ -120,6 +143,37 @@ static int ext_queue_sanity_checks(struct hl_device *hdev,
 }
 
 /*
+ * int_queue_sanity_checks - perform some sanity checks on internal queue
+ *
+ * @hdev              : pointer to hl_device structure
+ * @q                 :	pointer to hl_hw_queue structure
+ * @num_of_entries    : how many entries to check for space
+ *
+ * H/W queues spinlock should be taken before calling this function
+ *
+ * Perform the following:
+ * - Make sure we have enough space in the h/w queue
+ *
+ */
+static int int_queue_sanity_checks(struct hl_device *hdev,
+					struct hl_hw_queue *q,
+					int num_of_entries)
+{
+	int free_slots_cnt;
+
+	/* Check we have enough space in the queue */
+	free_slots_cnt = queue_free_slots(q, q->int_queue_len);
+
+	if (free_slots_cnt < num_of_entries) {
+		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
+			q->hw_queue_id, num_of_entries);
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+/*
  * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
  *
  * @hdev: pointer to hl_device structure
@@ -166,6 +220,184 @@ out:
 }
 
 /*
+ * ext_hw_queue_schedule_job - submit an JOB to an external queue
+ *
+ * @job: pointer to the job that needs to be submitted to the queue
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
+{
+	struct hl_device *hdev = job->cs->ctx->hdev;
+	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
+	struct hl_cq_entry cq_pkt;
+	struct hl_cq *cq;
+	u64 cq_addr;
+	struct hl_cb *cb;
+	u32 ctl;
+	u32 len;
+	u64 ptr;
+
+	/*
+	 * Update the JOB ID inside the BD CTL so the device would know what
+	 * to write in the completion queue
+	 */
+	ctl = ((q->pi << BD_CTL_SHADOW_INDEX_SHIFT) & BD_CTL_SHADOW_INDEX_MASK);
+
+	cb = job->patched_cb;
+	len = job->job_cb_size;
+	ptr = cb->bus_address;
+
+	cq_pkt.data = (q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
+					& CQ_ENTRY_SHADOW_INDEX_MASK;
+	cq_pkt.data |= 1 << CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT;
+	cq_pkt.data |= 1 << CQ_ENTRY_READY_SHIFT;
+
+	/*
+	 * No need to protect pi_offset because scheduling to the
+	 * H/W queues is done under the scheduler mutex
+	 *
+	 * No need to check if CQ is full because it was already
+	 * checked in hl_queue_sanity_checks
+	 */
+	cq = &hdev->completion_queue[q->hw_queue_id];
+	cq_addr = cq->bus_address +
+			hdev->asic_prop.host_phys_base_address;
+	cq_addr += cq->pi * sizeof(struct hl_cq_entry);
+
+	hdev->asic_funcs->add_end_of_cb_packets(cb->kernel_address, len,
+				cq_addr, cq_pkt.data, q->hw_queue_id);
+
+	q->shadow_queue[hl_pi_2_offset(q->pi)] = job;
+
+	cq->pi = hl_cq_inc_ptr(cq->pi);
+
+	ext_queue_submit_bd(hdev, q, ctl, len, ptr);
+}
+
+/*
+ * int_hw_queue_schedule_job - submit an JOB to an internal queue
+ *
+ * @job: pointer to the job that needs to be submitted to the queue
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void int_hw_queue_schedule_job(struct hl_cs_job *job)
+{
+	struct hl_device *hdev = job->cs->ctx->hdev;
+	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
+	struct hl_bd bd;
+	u64 *pi, *pbd = (u64 *) &bd;
+
+	bd.ctl = 0;
+	bd.len = job->job_cb_size;
+	bd.ptr = (u64) (uintptr_t) job->user_cb;
+
+	pi = (u64 *) (uintptr_t) (q->kernel_address +
+		((q->pi & (q->int_queue_len - 1)) * sizeof(bd)));
+
+	pi[0] = pbd[0];
+	pi[1] = pbd[1];
+
+	q->pi++;
+	q->pi &= ((q->int_queue_len << 1) - 1);
+
+	/* Flush PQ entry write. Relevant only for specific ASICs */
+	hdev->asic_funcs->flush_pq_write(hdev, pi, pbd[0]);
+
+	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
+}
+
+/*
+ * hl_hw_queue_schedule_cs - schedule a command submission
+ *
+ * @job        : pointer to the CS
+ *
+ */
+int hl_hw_queue_schedule_cs(struct hl_cs *cs)
+{
+	struct hl_device *hdev = cs->ctx->hdev;
+	struct hl_cs_job *job, *tmp;
+	struct hl_hw_queue *q;
+	int rc = 0, i, cq_cnt;
+
+	hdev->asic_funcs->hw_queues_lock(hdev);
+
+	if (hl_device_disabled_or_in_reset(hdev)) {
+		dev_err(hdev->dev,
+			"device is disabled or in reset, CS rejected!\n");
+		rc = -EPERM;
+		goto out;
+	}
+
+	q = &hdev->kernel_queues[0];
+	/* This loop assumes all external queues are consecutive */
+	for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
+		if (q->queue_type == QUEUE_TYPE_EXT) {
+			if (cs->jobs_in_queue_cnt[i]) {
+				rc = ext_queue_sanity_checks(hdev, q,
+					cs->jobs_in_queue_cnt[i], true);
+				if (rc)
+					goto unroll_cq_resv;
+				cq_cnt++;
+			}
+		} else if (q->queue_type == QUEUE_TYPE_INT) {
+			if (cs->jobs_in_queue_cnt[i]) {
+				rc = int_queue_sanity_checks(hdev, q,
+					cs->jobs_in_queue_cnt[i]);
+				if (rc)
+					goto unroll_cq_resv;
+			}
+		}
+	}
+
+	spin_lock(&hdev->hw_queues_mirror_lock);
+	list_add_tail(&cs->mirror_node, &hdev->hw_queues_mirror_list);
+
+	/* Queue TDR if the CS is the first entry and if timeout is wanted */
+	if ((hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) &&
+			(list_first_entry(&hdev->hw_queues_mirror_list,
+					struct hl_cs, mirror_node) == cs)) {
+		cs->tdr_active = true;
+		schedule_delayed_work(&cs->work_tdr, hdev->timeout_jiffies);
+		spin_unlock(&hdev->hw_queues_mirror_lock);
+	} else {
+		spin_unlock(&hdev->hw_queues_mirror_lock);
+	}
+
+	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) {
+		if (job->ext_queue)
+			ext_hw_queue_schedule_job(job);
+		else
+			int_hw_queue_schedule_job(job);
+	}
+
+	cs->submitted = true;
+
+	goto out;
+
+unroll_cq_resv:
+	/* This loop assumes all external queues are consecutive */
+	q = &hdev->kernel_queues[0];
+	for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) {
+		if ((q->queue_type == QUEUE_TYPE_EXT) &&
+				(cs->jobs_in_queue_cnt[i])) {
+			atomic_t *free_slots =
+				&hdev->completion_queue[i].free_slots_cnt;
+			atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
+			cq_cnt--;
+		}
+	}
+
+out:
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+
+	return rc;
+}
+
+/*
  * hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue
  *
  * @hdev: pointer to hl_device structure
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
new file mode 100644
index 000000000000..ad14376a1c25
--- /dev/null
+++ b/drivers/misc/habanalabs/memory.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include "habanalabs.h"
+
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+/*
+ * hl_pin_host_memory - pins a chunk of host memory
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @addr                : the user-space virtual address of the memory area
+ * @size                : the size of the memory area
+ * @userptr	        : pointer to hl_userptr structure
+ *
+ * This function does the following:
+ * - Pins the physical pages
+ * - Create a SG list from those pages
+ */
+int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
+			struct hl_userptr *userptr)
+{
+	u64 start, end;
+	u32 npages, offset;
+	int rc;
+
+	if (!size) {
+		dev_err(hdev->dev, "size to pin is invalid - %d\n",
+			size);
+		return -EINVAL;
+	}
+
+	if (!access_ok((void __user *) (uintptr_t) addr, size)) {
+		dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n",
+			addr);
+		return -EFAULT;
+	}
+
+	/*
+	 * If the combination of the address and size requested for this memory
+	 * region causes an integer overflow, return error.
+	 */
+	if (((addr + size) < addr) ||
+			PAGE_ALIGN(addr + size) < (addr + size)) {
+		dev_err(hdev->dev,
+			"user pointer 0x%llx + %u causes integer overflow\n",
+			addr, size);
+		return -EINVAL;
+	}
+
+	start = addr & PAGE_MASK;
+	offset = addr & ~PAGE_MASK;
+	end = PAGE_ALIGN(addr + size);
+	npages = (end - start) >> PAGE_SHIFT;
+
+	userptr->size = size;
+	userptr->addr = addr;
+	userptr->dma_mapped = false;
+	INIT_LIST_HEAD(&userptr->job_node);
+
+	userptr->vec = frame_vector_create(npages);
+	if (!userptr->vec) {
+		dev_err(hdev->dev, "Failed to create frame vector\n");
+		return -ENOMEM;
+	}
+
+	rc = get_vaddr_frames(start, npages, FOLL_FORCE | FOLL_WRITE,
+				userptr->vec);
+
+	if (rc != npages) {
+		dev_err(hdev->dev,
+			"Failed to map host memory, user ptr probably wrong\n");
+		if (rc < 0)
+			goto destroy_framevec;
+		rc = -EFAULT;
+		goto put_framevec;
+	}
+
+	if (frame_vector_to_pages(userptr->vec) < 0) {
+		dev_err(hdev->dev,
+			"Failed to translate frame vector to pages\n");
+		rc = -EFAULT;
+		goto put_framevec;
+	}
+
+	userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_ATOMIC);
+	if (!userptr->sgt) {
+		rc = -ENOMEM;
+		goto put_framevec;
+	}
+
+	rc = sg_alloc_table_from_pages(userptr->sgt,
+					frame_vector_pages(userptr->vec),
+					npages, offset, size, GFP_ATOMIC);
+	if (rc < 0) {
+		dev_err(hdev->dev, "failed to create SG table from pages\n");
+		goto free_sgt;
+	}
+
+	return 0;
+
+free_sgt:
+	kfree(userptr->sgt);
+put_framevec:
+	put_vaddr_frames(userptr->vec);
+destroy_framevec:
+	frame_vector_destroy(userptr->vec);
+	return rc;
+}
+
+/*
+ * hl_unpin_host_memory - unpins a chunk of host memory
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @userptr             : pointer to hl_userptr structure
+ *
+ * This function does the following:
+ * - Unpins the physical pages related to the host memory
+ * - Free the SG list
+ */
+int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
+{
+	struct page **pages;
+
+	if (userptr->dma_mapped)
+		hdev->asic_funcs->hl_dma_unmap_sg(hdev,
+				userptr->sgt->sgl,
+				userptr->sgt->nents,
+				userptr->dir);
+
+	pages = frame_vector_pages(userptr->vec);
+	if (!IS_ERR(pages)) {
+		int i;
+
+		for (i = 0; i < frame_vector_count(userptr->vec); i++)
+			set_page_dirty_lock(pages[i]);
+	}
+	put_vaddr_frames(userptr->vec);
+	frame_vector_destroy(userptr->vec);
+
+	list_del(&userptr->job_node);
+
+	sg_free_table(userptr->sgt);
+	kfree(userptr->sgt);
+
+	return 0;
+}
+
+/*
+ * hl_userptr_delete_list - clear userptr list
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @userptr_list        : pointer to the list to clear
+ *
+ * This function does the following:
+ * - Iterates over the list and unpins the host memory and frees the userptr
+ *   structure.
+ */
+void hl_userptr_delete_list(struct hl_device *hdev,
+				struct list_head *userptr_list)
+{
+	struct hl_userptr *userptr, *tmp;
+
+	list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) {
+		hl_unpin_host_memory(hdev, userptr);
+		kfree(userptr);
+	}
+
+	INIT_LIST_HEAD(userptr_list);
+}
+
+/*
+ * hl_userptr_is_pinned - returns whether the given userptr is pinned
+ *
+ * @hdev                : pointer to the habanalabs device structure
+ * @userptr_list        : pointer to the list to clear
+ * @userptr             : pointer to userptr to check
+ *
+ * This function does the following:
+ * - Iterates over the list and checks if the given userptr is in it, means is
+ *   pinned. If so, returns true, otherwise returns false.
+ */
+bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
+				u32 size, struct list_head *userptr_list,
+				struct hl_userptr **userptr)
+{
+	list_for_each_entry((*userptr), userptr_list, job_node) {
+		if ((addr == (*userptr)->addr) && (size == (*userptr)->size))
+			return true;
+	}
+
+	return false;
+}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 756266cf0416..fba49417f607 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -74,6 +74,95 @@ union hl_cb_args {
 };
 
 /*
+ * This structure size must always be fixed to 64-bytes for backward
+ * compatibility
+ */
+struct hl_cs_chunk {
+	/*
+	 * For external queue, this represents a Handle of CB on the Host
+	 * For internal queue, this represents an SRAM or DRAM address of the
+	 * internal CB
+	 */
+	__u64 cb_handle;
+	/* Index of queue to put the CB on */
+	__u32 queue_index;
+	/*
+	 * Size of command buffer with valid packets
+	 * Can be smaller then actual CB size
+	 */
+	__u32 cb_size;
+	/* HL_CS_CHUNK_FLAGS_* */
+	__u32 cs_chunk_flags;
+	/* Align structure to 64 bytes */
+	__u32 pad[11];
+};
+
+#define HL_CS_FLAGS_FORCE_RESTORE	0x1
+
+#define HL_CS_STATUS_SUCCESS		0
+
+struct hl_cs_in {
+	/* this holds address of array of hl_cs_chunk for restore phase */
+	__u64 chunks_restore;
+	/* this holds address of array of hl_cs_chunk for execution phase */
+	__u64 chunks_execute;
+	/* this holds address of array of hl_cs_chunk for store phase -
+	 * Currently not in use
+	 */
+	__u64 chunks_store;
+	/* Number of chunks in restore phase array */
+	__u32 num_chunks_restore;
+	/* Number of chunks in execution array */
+	__u32 num_chunks_execute;
+	/* Number of chunks in restore phase array - Currently not in use */
+	__u32 num_chunks_store;
+	/* HL_CS_FLAGS_* */
+	__u32 cs_flags;
+	/* Context ID - Currently not in use */
+	__u32 ctx_id;
+};
+
+struct hl_cs_out {
+	/* this holds the sequence number of the CS to pass to wait ioctl */
+	__u64 seq;
+	/* HL_CS_STATUS_* */
+	__u32 status;
+	__u32 pad;
+};
+
+union hl_cs_args {
+	struct hl_cs_in in;
+	struct hl_cs_out out;
+};
+
+struct hl_wait_cs_in {
+	/* Command submission sequence number */
+	__u64 seq;
+	/* Absolute timeout to wait in microseconds */
+	__u64 timeout_us;
+	/* Context ID - Currently not in use */
+	__u32 ctx_id;
+	__u32 pad;
+};
+
+#define HL_WAIT_CS_STATUS_COMPLETED	0
+#define HL_WAIT_CS_STATUS_BUSY		1
+#define HL_WAIT_CS_STATUS_TIMEDOUT	2
+#define HL_WAIT_CS_STATUS_ABORTED	3
+#define HL_WAIT_CS_STATUS_INTERRUPTED	4
+
+struct hl_wait_cs_out {
+	/* HL_WAIT_CS_STATUS_* */
+	__u32 status;
+	__u32 pad;
+};
+
+union hl_wait_cs_args {
+	struct hl_wait_cs_in in;
+	struct hl_wait_cs_out out;
+};
+
+/*
  * Command Buffer
  * - Request a Command Buffer
  * - Destroy a Command Buffer
@@ -89,7 +178,74 @@ union hl_cb_args {
 #define HL_IOCTL_CB		\
 		_IOWR('H', 0x02, union hl_cb_args)
 
+/*
+ * Command Submission
+ *
+ * To submit work to the device, the user need to call this IOCTL with a set
+ * of JOBS. That set of JOBS constitutes a CS object.
+ * Each JOB will be enqueued on a specific queue, according to the user's input.
+ * There can be more then one JOB per queue.
+ *
+ * There are two types of queues - external and internal. External queues
+ * are DMA queues which transfer data from/to the Host. All other queues are
+ * internal. The driver will get completion notifications from the device only
+ * on JOBS which are enqueued in the external queues.
+ *
+ * This IOCTL is asynchronous in regard to the actual execution of the CS. This
+ * means it returns immediately after ALL the JOBS were enqueued on their
+ * relevant queues. Therefore, the user mustn't assume the CS has been completed
+ * or has even started to execute.
+ *
+ * Upon successful enqueue, the IOCTL returns an opaque handle which the user
+ * can use with the "Wait for CS" IOCTL to check whether the handle's CS
+ * external JOBS have been completed. Note that if the CS has internal JOBS
+ * which can execute AFTER the external JOBS have finished, the driver might
+ * report that the CS has finished executing BEFORE the internal JOBS have
+ * actually finish executing.
+ *
+ * The CS IOCTL will receive three sets of JOBS. One set is for "restore" phase,
+ * a second set is for "execution" phase and a third set is for "store" phase.
+ * The JOBS on the "restore" phase are enqueued only after context-switch
+ * (or if its the first CS for this context). The user can also order the
+ * driver to run the "restore" phase explicitly
+ *
+ */
+#define HL_IOCTL_CS			\
+		_IOWR('H', 0x03, union hl_cs_args)
+
+/*
+ * Wait for Command Submission
+ *
+ * The user can call this IOCTL with a handle it received from the CS IOCTL
+ * to wait until the handle's CS has finished executing. The user will wait
+ * inside the kernel until the CS has finished or until the user-requeusted
+ * timeout has expired.
+ *
+ * The return value of the IOCTL is a standard Linux error code. The possible
+ * values are:
+ *
+ * EINTR     - Kernel waiting has been interrupted, e.g. due to OS signal
+ *             that the user process received
+ * ETIMEDOUT - The CS has caused a timeout on the device
+ * EIO       - The CS was aborted (usually because the device was reset)
+ * ENODEV    - The device wants to do hard-reset (so user need to close FD)
+ *
+ * The driver also returns a custom define inside the IOCTL which can be:
+ *
+ * HL_WAIT_CS_STATUS_COMPLETED   - The CS has been completed successfully (0)
+ * HL_WAIT_CS_STATUS_BUSY        - The CS is still executing (0)
+ * HL_WAIT_CS_STATUS_TIMEDOUT    - The CS has caused a timeout on the device
+ *                                 (ETIMEDOUT)
+ * HL_WAIT_CS_STATUS_ABORTED     - The CS was aborted, usually because the
+ *                                 device was reset (EIO)
+ * HL_WAIT_CS_STATUS_INTERRUPTED - Waiting for the CS was interrupted (EINTR)
+ *
+ */
+
+#define HL_IOCTL_WAIT_CS			\
+		_IOWR('H', 0x04, union hl_wait_cs_args)
+
 #define HL_COMMAND_START	0x02
-#define HL_COMMAND_END		0x03
+#define HL_COMMAND_END		0x05
 
 #endif /* HABANALABS_H_ */