summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/misc/habanalabs/Makefile3
-rw-r--r--drivers/misc/habanalabs/command_submission.c766
-rw-r--r--drivers/misc/habanalabs/context.c52
-rw-r--r--drivers/misc/habanalabs/device.c16
-rw-r--r--drivers/misc/habanalabs/goya/goya.c1126
-rw-r--r--drivers/misc/habanalabs/habanalabs.h275
-rw-r--r--drivers/misc/habanalabs/habanalabs_drv.c20
-rw-r--r--drivers/misc/habanalabs/habanalabs_ioctl.c4
-rw-r--r--drivers/misc/habanalabs/hw_queue.c232
-rw-r--r--drivers/misc/habanalabs/memory.c198
-rw-r--r--include/uapi/misc/habanalabs.h158
11 files changed, 2843 insertions, 7 deletions
diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
index b5607233d216..d2fd0e18b1eb 100644
--- a/drivers/misc/habanalabs/Makefile
+++ b/drivers/misc/habanalabs/Makefile
@@ -5,7 +5,8 @@
obj-m := habanalabs.o
habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
- command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o
+ command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
+ command_submission.o
include $(src)/goya/Makefile
habanalabs-y += $(HL_GOYA_FILES)
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
new file mode 100644
index 000000000000..ae68b97e428d
--- /dev/null
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -0,0 +1,766 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include <uapi/misc/habanalabs.h>
+#include "habanalabs.h"
+
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+static void job_wq_completion(struct work_struct *work);
+static long _hl_cs_wait_ioctl(struct hl_device *hdev,
+ struct hl_ctx *ctx, u64 timeout_us, u64 seq);
+static void cs_do_release(struct kref *ref);
+
+static const char *hl_fence_get_driver_name(struct dma_fence *fence)
+{
+ return "HabanaLabs";
+}
+
+static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
+{
+ struct hl_dma_fence *hl_fence =
+ container_of(fence, struct hl_dma_fence, base_fence);
+
+ return dev_name(hl_fence->hdev->dev);
+}
+
+static bool hl_fence_enable_signaling(struct dma_fence *fence)
+{
+ return true;
+}
+
+static void hl_fence_release(struct dma_fence *fence)
+{
+ struct hl_dma_fence *hl_fence =
+ container_of(fence, struct hl_dma_fence, base_fence);
+
+ kfree_rcu(hl_fence, base_fence.rcu);
+}
+
+static const struct dma_fence_ops hl_fence_ops = {
+ .get_driver_name = hl_fence_get_driver_name,
+ .get_timeline_name = hl_fence_get_timeline_name,
+ .enable_signaling = hl_fence_enable_signaling,
+ .wait = dma_fence_default_wait,
+ .release = hl_fence_release
+};
+
+static void cs_get(struct hl_cs *cs)
+{
+ kref_get(&cs->refcount);
+}
+
+static int cs_get_unless_zero(struct hl_cs *cs)
+{
+ return kref_get_unless_zero(&cs->refcount);
+}
+
+static void cs_put(struct hl_cs *cs)
+{
+ kref_put(&cs->refcount, cs_do_release);
+}
+
+/*
+ * cs_parser - parse the user command submission
+ *
+ * @hpriv : pointer to the private data of the fd
+ * @job : pointer to the job that holds the command submission info
+ *
+ * The function parses the command submission of the user. It calls the
+ * ASIC specific parser, which returns a list of memory blocks to send
+ * to the device as different command buffers
+ *
+ */
+static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
+{
+ struct hl_device *hdev = hpriv->hdev;
+ struct hl_cs_parser parser;
+ int rc;
+
+ parser.ctx_id = job->cs->ctx->asid;
+ parser.cs_sequence = job->cs->sequence;
+ parser.job_id = job->id;
+
+ parser.hw_queue_id = job->hw_queue_id;
+ parser.job_userptr_list = &job->userptr_list;
+ parser.patched_cb = NULL;
+ parser.user_cb = job->user_cb;
+ parser.user_cb_size = job->user_cb_size;
+ parser.ext_queue = job->ext_queue;
+ job->patched_cb = NULL;
+ parser.use_virt_addr = hdev->mmu_enable;
+
+ rc = hdev->asic_funcs->cs_parser(hdev, &parser);
+ if (job->ext_queue) {
+ if (!rc) {
+ job->patched_cb = parser.patched_cb;
+ job->job_cb_size = parser.patched_cb_size;
+
+ spin_lock(&job->patched_cb->lock);
+ job->patched_cb->cs_cnt++;
+ spin_unlock(&job->patched_cb->lock);
+ }
+
+ /*
+ * Whether the parsing worked or not, we don't need the
+ * original CB anymore because it was already parsed and
+ * won't be accessed again for this CS
+ */
+ spin_lock(&job->user_cb->lock);
+ job->user_cb->cs_cnt--;
+ spin_unlock(&job->user_cb->lock);
+ hl_cb_put(job->user_cb);
+ job->user_cb = NULL;
+ }
+
+ return rc;
+}
+
+static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
+{
+ struct hl_cs *cs = job->cs;
+
+ if (job->ext_queue) {
+ hl_userptr_delete_list(hdev, &job->userptr_list);
+
+ /*
+ * We might arrive here from rollback and patched CB wasn't
+ * created, so we need to check it's not NULL
+ */
+ if (job->patched_cb) {
+ spin_lock(&job->patched_cb->lock);
+ job->patched_cb->cs_cnt--;
+ spin_unlock(&job->patched_cb->lock);
+
+ hl_cb_put(job->patched_cb);
+ }
+ }
+
+ /*
+ * This is the only place where there can be multiple threads
+ * modifying the list at the same time
+ */
+ spin_lock(&cs->job_lock);
+ list_del(&job->cs_node);
+ spin_unlock(&cs->job_lock);
+
+ if (job->ext_queue)
+ cs_put(cs);
+
+ kfree(job);
+}
+
+static void cs_do_release(struct kref *ref)
+{
+ struct hl_cs *cs = container_of(ref, struct hl_cs,
+ refcount);
+ struct hl_device *hdev = cs->ctx->hdev;
+ struct hl_cs_job *job, *tmp;
+
+ cs->completed = true;
+
+ /*
+ * Although if we reached here it means that all external jobs have
+ * finished, because each one of them took refcnt to CS, we still
+ * need to go over the internal jobs and free them. Otherwise, we
+ * will have leaked memory and what's worse, the CS object (and
+ * potentially the CTX object) could be released, while the JOB
+ * still holds a pointer to them (but no reference).
+ */
+ list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
+ free_job(hdev, job);
+
+ /* We also need to update CI for internal queues */
+ if (cs->submitted) {
+ hl_int_hw_queue_update_ci(cs);
+
+ spin_lock(&hdev->hw_queues_mirror_lock);
+ /* remove CS from hw_queues mirror list */
+ list_del_init(&cs->mirror_node);
+ spin_unlock(&hdev->hw_queues_mirror_lock);
+
+ /*
+ * Don't cancel TDR in case this CS was timedout because we
+ * might be running from the TDR context
+ */
+ if ((!cs->timedout) &&
+ (hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT)) {
+ struct hl_cs *next;
+
+ if (cs->tdr_active)
+ cancel_delayed_work_sync(&cs->work_tdr);
+
+ spin_lock(&hdev->hw_queues_mirror_lock);
+
+ /* queue TDR for next CS */
+ next = list_first_entry_or_null(
+ &hdev->hw_queues_mirror_list,
+ struct hl_cs, mirror_node);
+
+ if ((next) && (!next->tdr_active)) {
+ next->tdr_active = true;
+ schedule_delayed_work(&next->work_tdr,
+ hdev->timeout_jiffies);
+ }
+
+ spin_unlock(&hdev->hw_queues_mirror_lock);
+ }
+ }
+
+ hl_ctx_put(cs->ctx);
+
+ if (cs->timedout)
+ dma_fence_set_error(cs->fence, -ETIMEDOUT);
+ else if (cs->aborted)
+ dma_fence_set_error(cs->fence, -EIO);
+
+ dma_fence_signal(cs->fence);
+ dma_fence_put(cs->fence);
+
+ kfree(cs);
+}
+
+static void cs_timedout(struct work_struct *work)
+{
+ struct hl_device *hdev;
+ int ctx_asid, rc;
+ struct hl_cs *cs = container_of(work, struct hl_cs,
+ work_tdr.work);
+ rc = cs_get_unless_zero(cs);
+ if (!rc)
+ return;
+
+ if ((!cs->submitted) || (cs->completed)) {
+ cs_put(cs);
+ return;
+ }
+
+ /* Mark the CS is timed out so we won't try to cancel its TDR */
+ cs->timedout = true;
+
+ hdev = cs->ctx->hdev;
+ ctx_asid = cs->ctx->asid;
+
+ /* TODO: add information about last signaled seq and last emitted seq */
+ dev_err(hdev->dev, "CS %d.%llu got stuck!\n", ctx_asid, cs->sequence);
+
+ cs_put(cs);
+
+ if (hdev->reset_on_lockup)
+ hl_device_reset(hdev, false, false);
+}
+
+static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
+ struct hl_cs **cs_new)
+{
+ struct hl_dma_fence *fence;
+ struct dma_fence *other = NULL;
+ struct hl_cs *cs;
+ int rc;
+
+ cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
+ if (!cs)
+ return -ENOMEM;
+
+ cs->ctx = ctx;
+ cs->submitted = false;
+ cs->completed = false;
+ INIT_LIST_HEAD(&cs->job_list);
+ INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
+ kref_init(&cs->refcount);
+ spin_lock_init(&cs->job_lock);
+
+ fence = kmalloc(sizeof(*fence), GFP_ATOMIC);
+ if (!fence) {
+ rc = -ENOMEM;
+ goto free_cs;
+ }
+
+ fence->hdev = hdev;
+ spin_lock_init(&fence->lock);
+ cs->fence = &fence->base_fence;
+
+ spin_lock(&ctx->cs_lock);
+
+ fence->cs_seq = ctx->cs_sequence;
+ other = ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)];
+ if ((other) && (!dma_fence_is_signaled(other))) {
+ spin_unlock(&ctx->cs_lock);
+ rc = -EAGAIN;
+ goto free_fence;
+ }
+
+ dma_fence_init(&fence->base_fence, &hl_fence_ops, &fence->lock,
+ ctx->asid, ctx->cs_sequence);
+
+ cs->sequence = fence->cs_seq;
+
+ ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+ &fence->base_fence;
+ ctx->cs_sequence++;
+
+ dma_fence_get(&fence->base_fence);
+
+ dma_fence_put(other);
+
+ spin_unlock(&ctx->cs_lock);
+
+ *cs_new = cs;
+
+ return 0;
+
+free_fence:
+ kfree(fence);
+free_cs:
+ kfree(cs);
+ return rc;
+}
+
+static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
+{
+ struct hl_cs_job *job, *tmp;
+
+ list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
+ free_job(hdev, job);
+}
+
+void hl_cs_rollback_all(struct hl_device *hdev)
+{
+ struct hl_cs *cs, *tmp;
+
+ /* flush all completions */
+ flush_workqueue(hdev->cq_wq);
+
+ /* Make sure we don't have leftovers in the H/W queues mirror list */
+ list_for_each_entry_safe(cs, tmp, &hdev->hw_queues_mirror_list,
+ mirror_node) {
+ cs_get(cs);
+ cs->aborted = true;
+ dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
+ cs->ctx->asid, cs->sequence);
+ cs_rollback(hdev, cs);
+ cs_put(cs);
+ }
+}
+
+static void job_wq_completion(struct work_struct *work)
+{
+ struct hl_cs_job *job = container_of(work, struct hl_cs_job,
+ finish_work);
+ struct hl_cs *cs = job->cs;
+ struct hl_device *hdev = cs->ctx->hdev;
+
+ /* job is no longer needed */
+ free_job(hdev, job);
+}
+
+static struct hl_cb *validate_queue_index(struct hl_device *hdev,
+ struct hl_cb_mgr *cb_mgr,
+ struct hl_cs_chunk *chunk,
+ bool *ext_queue)
+{
+ struct asic_fixed_properties *asic = &hdev->asic_prop;
+ struct hw_queue_properties *hw_queue_prop;
+ u32 cb_handle;
+ struct hl_cb *cb;
+
+ /* Assume external queue */
+ *ext_queue = true;
+
+ hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
+
+ if ((chunk->queue_index >= HL_MAX_QUEUES) ||
+ (hw_queue_prop->type == QUEUE_TYPE_NA)) {
+ dev_err(hdev->dev, "Queue index %d is invalid\n",
+ chunk->queue_index);
+ return NULL;
+ }
+
+ if (hw_queue_prop->kmd_only) {
+ dev_err(hdev->dev, "Queue index %d is restricted for KMD\n",
+ chunk->queue_index);
+ return NULL;
+ } else if (hw_queue_prop->type == QUEUE_TYPE_INT) {
+ *ext_queue = false;
+ return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
+ }
+
+ /* Retrieve CB object */
+ cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
+
+ cb = hl_cb_get(hdev, cb_mgr, cb_handle);
+ if (!cb) {
+ dev_err(hdev->dev, "CB handle 0x%x invalid\n", cb_handle);
+ return NULL;
+ }
+
+ if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) {
+ dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size);
+ goto release_cb;
+ }
+
+ spin_lock(&cb->lock);
+ cb->cs_cnt++;
+ spin_unlock(&cb->lock);
+
+ return cb;
+
+release_cb:
+ hl_cb_put(cb);
+ return NULL;
+}
+
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
+{
+ struct hl_cs_job *job;
+
+ job = kzalloc(sizeof(*job), GFP_ATOMIC);
+ if (!job)
+ return NULL;
+
+ job->ext_queue = ext_queue;
+
+ if (job->ext_queue) {
+ INIT_LIST_HEAD(&job->userptr_list);
+ INIT_WORK(&job->finish_work, job_wq_completion);
+ }
+
+ return job;
+}
+
+static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
+ u32 num_chunks, u64 *cs_seq)
+{
+ struct hl_device *hdev = hpriv->hdev;
+ struct hl_cs_chunk *cs_chunk_array;
+ struct hl_cs_job *job;
+ struct hl_cs *cs;
+ struct hl_cb *cb;
+ bool ext_queue_present = false;
+ u32 size_to_copy;
+ int rc, i, parse_cnt;
+
+ *cs_seq = ULLONG_MAX;
+
+ if (num_chunks > HL_MAX_JOBS_PER_CS) {
+ dev_err(hdev->dev,
+ "Number of chunks can NOT be larger than %d\n",
+ HL_MAX_JOBS_PER_CS);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array),
+ GFP_ATOMIC);
+ if (!cs_chunk_array) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
+ if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) {
+ dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
+ rc = -EFAULT;
+ goto free_cs_chunk_array;
+ }
+
+ /* increment refcnt for context */
+ hl_ctx_get(hdev, hpriv->ctx);
+
+ rc = allocate_cs(hdev, hpriv->ctx, &cs);
+ if (rc) {
+ hl_ctx_put(hpriv->ctx);
+ goto free_cs_chunk_array;
+ }
+
+ *cs_seq = cs->sequence;
+
+ /* Validate ALL the CS chunks before submitting the CS */
+ for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) {
+ struct hl_cs_chunk *chunk = &cs_chunk_array[i];
+ bool ext_queue;
+
+ cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk,
+ &ext_queue);
+ if (ext_queue) {
+ ext_queue_present = true;
+ if (!cb) {
+ rc = -EINVAL;
+ goto free_cs_object;
+ }
+ }
+
+ job = hl_cs_allocate_job(hdev, ext_queue);
+ if (!job) {
+ dev_err(hdev->dev, "Failed to allocate a new job\n");
+ rc = -ENOMEM;
+ if (ext_queue)
+ goto release_cb;
+ else
+ goto free_cs_object;
+ }
+
+ job->id = i + 1;
+ job->cs = cs;
+ job->user_cb = cb;
+ job->user_cb_size = chunk->cb_size;
+ if (job->ext_queue)
+ job->job_cb_size = cb->size;
+ else
+ job->job_cb_size = chunk->cb_size;
+ job->hw_queue_id = chunk->queue_index;
+
+ cs->jobs_in_queue_cnt[job->hw_queue_id]++;
+
+ list_add_tail(&job->cs_node, &cs->job_list);
+
+ /*
+ * Increment CS reference. When CS reference is 0, CS is
+ * done and can be signaled to user and free all its resources
+ * Only increment for JOB on external queues, because only
+ * for those JOBs we get completion
+ */
+ if (job->ext_queue)
+ cs_get(cs);
+
+ rc = cs_parser(hpriv, job);
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
+ cs->ctx->asid, cs->sequence, job->id, rc);
+ goto free_cs_object;
+ }
+ }
+
+ if (!ext_queue_present) {
+ dev_err(hdev->dev,
+ "Reject CS %d.%llu because no external queues jobs\n",
+ cs->ctx->asid, cs->sequence);
+ rc = -EINVAL;
+ goto free_cs_object;
+ }
+
+ rc = hl_hw_queue_schedule_cs(cs);
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed to submit CS %d.%llu to H/W queues, error %d\n",
+ cs->ctx->asid, cs->sequence, rc);
+ goto free_cs_object;
+ }
+
+ rc = HL_CS_STATUS_SUCCESS;
+ goto put_cs;
+
+release_cb:
+ spin_lock(&cb->lock);
+ cb->cs_cnt--;
+ spin_unlock(&cb->lock);
+ hl_cb_put(cb);
+free_cs_object:
+ cs_rollback(hdev, cs);
+ *cs_seq = ULLONG_MAX;
+ /* The path below is both for good and erroneous exits */
+put_cs:
+ /* We finished with the CS in this function, so put the ref */
+ cs_put(cs);
+free_cs_chunk_array:
+ kfree(cs_chunk_array);
+out:
+ return rc;
+}
+
+int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+ struct hl_device *hdev = hpriv->hdev;
+ union hl_cs_args *args = data;
+ struct hl_ctx *ctx = hpriv->ctx;
+ void __user *chunks;
+ u32 num_chunks;
+ u64 cs_seq = ULONG_MAX;
+ int rc, do_restore;
+ bool need_soft_reset = false;
+
+ if (hl_device_disabled_or_in_reset(hdev)) {
+ dev_warn(hdev->dev,
+ "Device is %s. Can't submit new CS\n",
+ atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+ rc = -EBUSY;
+ goto out;
+ }
+
+ do_restore = atomic_cmpxchg(&ctx->thread_restore_token, 1, 0);
+
+ if (do_restore || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
+ long ret;
+
+ chunks = (void __user *)(uintptr_t)args->in.chunks_restore;
+ num_chunks = args->in.num_chunks_restore;
+
+ mutex_lock(&hpriv->restore_phase_mutex);
+
+ if (do_restore) {
+ rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
+ if (rc) {
+ dev_err_ratelimited(hdev->dev,
+ "Failed to switch to context %d, rejecting CS! %d\n",
+ ctx->asid, rc);
+ /*
+ * If we timedout, we need to soft-reset because
+ * QMAN is probably stuck. However, we can't
+ * call to reset here directly because of
+ * deadlock, so need to do it at the very end
+ * of this function
+ */
+ if (rc == -ETIMEDOUT)
+ need_soft_reset = true;
+ mutex_unlock(&hpriv->restore_phase_mutex);
+ goto out;
+ }
+ }
+
+ hdev->asic_funcs->restore_phase_topology(hdev);
+
+ if (num_chunks == 0) {
+ dev_dbg(hdev->dev,
+ "Need to run restore phase but restore CS is empty\n");
+ rc = 0;
+ } else {
+ rc = _hl_cs_ioctl(hpriv, chunks, num_chunks,
+ &cs_seq);
+ }
+
+ mutex_unlock(&hpriv->restore_phase_mutex);
+
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed to submit restore CS for context %d (%d)\n",
+ ctx->asid, rc);
+ goto out;
+ }
+
+ /* Need to wait for restore completion before execution phase */
+ if (num_chunks > 0) {
+ ret = _hl_cs_wait_ioctl(hdev, ctx,
+ jiffies_to_usecs(hdev->timeout_jiffies),
+ cs_seq);
+ if (ret <= 0) {
+ dev_err(hdev->dev,
+ "Restore CS for context %d failed to complete %ld\n",
+ ctx->asid, ret);
+ rc = -ENOEXEC;
+ goto out;
+ }
+ }
+
+ ctx->thread_restore_wait_token = 1;
+ } else if (!ctx->thread_restore_wait_token) {
+ u32 tmp;
+
+ rc = hl_poll_timeout_memory(hdev,
+ (u64) (uintptr_t) &ctx->thread_restore_wait_token,
+ jiffies_to_usecs(hdev->timeout_jiffies),
+ &tmp);
+
+ if (rc || !tmp) {
+ dev_err(hdev->dev,
+ "restore phase hasn't finished in time\n");
+ rc = -ETIMEDOUT;
+ goto out;
+ }
+ }
+
+ chunks = (void __user *)(uintptr_t)args->in.chunks_execute;
+ num_chunks = args->in.num_chunks_execute;
+
+ if (num_chunks == 0) {
+ dev_err(hdev->dev,
+ "Got execute CS with 0 chunks, context %d\n",
+ ctx->asid);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = _hl_cs_ioctl(hpriv, chunks, num_chunks, &cs_seq);
+
+out:
+ if (rc != -EAGAIN) {
+ memset(args, 0, sizeof(*args));
+ args->out.status = rc;
+ args->out.seq = cs_seq;
+ }
+
+ if ((rc == -ETIMEDOUT) && (need_soft_reset))
+ hl_device_reset(hdev, false, false);
+
+ return rc;
+}
+
+static long _hl_cs_wait_ioctl(struct hl_device *hdev,
+ struct hl_ctx *ctx, u64 timeout_us, u64 seq)
+{
+ struct dma_fence *fence;
+ unsigned long timeout;
+ long rc;
+
+ if (timeout_us == MAX_SCHEDULE_TIMEOUT)
+ timeout = timeout_us;
+ else
+ timeout = usecs_to_jiffies(timeout_us);
+
+ hl_ctx_get(hdev, ctx);
+
+ fence = hl_ctx_get_fence(ctx, seq);
+ if (IS_ERR(fence)) {
+ rc = PTR_ERR(fence);
+ } else if (fence) {
+ rc = dma_fence_wait_timeout(fence, true, timeout);
+ if (fence->error == -ETIMEDOUT)
+ rc = -ETIMEDOUT;
+ else if (fence->error == -EIO)
+ rc = -EIO;
+ dma_fence_put(fence);
+ } else
+ rc = 1;
+
+ hl_ctx_put(ctx);
+
+ return rc;
+}
+
+int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+ struct hl_device *hdev = hpriv->hdev;
+ union hl_wait_cs_args *args = data;
+ u64 seq = args->in.seq;
+ long rc;
+
+ rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq);
+
+ memset(args, 0, sizeof(*args));
+
+ if (rc < 0) {
+ dev_err(hdev->dev, "Error %ld on waiting for CS handle %llu\n",
+ rc, seq);
+ if (rc == -ERESTARTSYS) {
+ args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED;
+ rc = -EINTR;
+ } else if (rc == -ETIMEDOUT) {
+ args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
+ } else if (rc == -EIO) {
+ args->out.status = HL_WAIT_CS_STATUS_ABORTED;
+ }
+ return rc;
+ }
+
+ if (rc == 0)
+ args->out.status = HL_WAIT_CS_STATUS_BUSY;
+ else
+ args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
+
+ return 0;
+}
diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c
index de1258e7a6e6..c3854714b46c 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/context.c
@@ -12,6 +12,18 @@
static void hl_ctx_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
+ int i;
+
+ /*
+ * If we arrived here, there are no jobs waiting for this context
+ * on its queues so we can safely remove it.
+ * This is because for each CS, we increment the ref count and for
+ * every CS that was finished we decrement it and we won't arrive
+ * to this function unless the ref count is 0
+ */
+
+ for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
+ dma_fence_put(ctx->cs_pending[i]);
if (ctx->asid != HL_KERNEL_ASID_ID)
hl_asid_free(hdev, ctx->asid);
@@ -23,8 +35,6 @@ void hl_ctx_do_release(struct kref *ref)
ctx = container_of(ref, struct hl_ctx, refcount);
- dev_dbg(ctx->hdev->dev, "Now really releasing context %d\n", ctx->asid);
-
hl_ctx_fini(ctx);
if (ctx->hpriv)
@@ -90,6 +100,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
kref_init(&ctx->refcount);
+ ctx->cs_sequence = 1;
+ spin_lock_init(&ctx->cs_lock);
+ atomic_set(&ctx->thread_restore_token, 1);
+ ctx->thread_restore_wait_token = 0;
+
if (is_kernel_ctx) {
ctx->asid = HL_KERNEL_ASID_ID; /* KMD gets ASID 0 */
} else {
@@ -100,8 +115,6 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
}
}
- dev_dbg(hdev->dev, "Created context with ASID %u\n", ctx->asid);
-
return 0;
}
@@ -115,6 +128,37 @@ int hl_ctx_put(struct hl_ctx *ctx)
return kref_put(&ctx->refcount, hl_ctx_do_release);
}
+struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
+{
+ struct hl_device *hdev = ctx->hdev;
+ struct dma_fence *fence;
+
+ spin_lock(&ctx->cs_lock);
+
+ if (seq >= ctx->cs_sequence) {
+ dev_notice(hdev->dev,
+ "Can't wait on seq %llu because current CS is at seq %llu\n",
+ seq, ctx->cs_sequence);
+ spin_unlock(&ctx->cs_lock);
+ return ERR_PTR(-EINVAL);
+ }
+
+
+ if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
+ dev_dbg(hdev->dev,
+ "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
+ seq, ctx->cs_sequence);
+ spin_unlock(&ctx->cs_lock);
+ return NULL;
+ }
+
+ fence = dma_fence_get(
+ ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]);
+ spin_unlock(&ctx->cs_lock);
+
+ return fence;
+}
+
/*
* hl_ctx_mgr_init - initialize the context manager
*
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 2aa8a68cdf76..cc5f068df597 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -30,6 +30,8 @@ static void hpriv_release(struct kref *ref)
put_pid(hpriv->taskpid);
+ mutex_destroy(&hpriv->restore_phase_mutex);
+
kfree(hpriv);
/* Now the FD is really closed */
@@ -208,6 +210,8 @@ static int device_early_init(struct hl_device *hdev)
mutex_init(&hdev->fd_open_cnt_lock);
mutex_init(&hdev->send_cpu_message_lock);
+ INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
+ spin_lock_init(&hdev->hw_queues_mirror_lock);
atomic_set(&hdev->in_reset, 0);
atomic_set(&hdev->fd_open_cnt, 0);
@@ -593,6 +597,9 @@ again:
*/
hdev->asic_funcs->halt_engines(hdev, hard_reset);
+ /* Go over all the queues, release all CS and their jobs */
+ hl_cs_rollback_all(hdev);
+
if (hard_reset) {
/* Release kernel context */
if (hl_ctx_put(hdev->kernel_ctx) != 1) {
@@ -616,6 +623,12 @@ again:
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
hl_cq_reset(hdev, &hdev->completion_queue[i]);
+ /* Make sure the setup phase for the user context will run again */
+ if (hdev->user_ctx) {
+ atomic_set(&hdev->user_ctx->thread_restore_token, 1);
+ hdev->user_ctx->thread_restore_wait_token = 0;
+ }
+
/* Finished tear-down, starting to re-initialize */
if (hard_reset) {
@@ -952,6 +965,9 @@ void hl_device_fini(struct hl_device *hdev)
*/
hdev->asic_funcs->halt_engines(hdev, true);
+ /* Go over all the queues, release all CS and their jobs */
+ hl_cs_rollback_all(hdev);
+
hl_cb_pool_fini(hdev);
/* Release kernel context */
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 1fe1d6a1ff9e..e3878fd7dc94 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -95,6 +95,19 @@ static const char goya_irq_name[GOYA_MSIX_ENTRIES][GOYA_MAX_STRING_LEN] = {
"goya cq 4", "goya cpu eq"
};
+static u16 goya_packet_sizes[MAX_PACKET_ID] = {
+ [PACKET_WREG_32] = sizeof(struct packet_wreg32),
+ [PACKET_WREG_BULK] = sizeof(struct packet_wreg_bulk),
+ [PACKET_MSG_LONG] = sizeof(struct packet_msg_long),
+ [PACKET_MSG_SHORT] = sizeof(struct packet_msg_short),
+ [PACKET_CP_DMA] = sizeof(struct packet_cp_dma),
+ [PACKET_MSG_PROT] = sizeof(struct packet_msg_prot),
+ [PACKET_FENCE] = sizeof(struct packet_fence),
+ [PACKET_LIN_DMA] = sizeof(struct packet_lin_dma),
+ [PACKET_NOP] = sizeof(struct packet_nop),
+ [PACKET_STOP] = sizeof(struct packet_stop)
+};
+
static const char *goya_axi_name[GOYA_MAX_INITIATORS] = {
"MME0",
"MME1",
@@ -2978,6 +2991,84 @@ void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
return base;
}
+int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
+{
+ struct goya_device *goya = hdev->asic_specific;
+ struct packet_msg_prot *fence_pkt;
+ u32 *fence_ptr;
+ dma_addr_t fence_dma_addr;
+ struct hl_cb *cb;
+ u32 tmp;
+ int rc;
+
+ if (!hdev->asic_funcs->is_device_idle(hdev)) {
+ dev_err_ratelimited(hdev->dev,
+ "Can't send KMD job on QMAN0 if device is not idle\n");
+ return -EFAULT;
+ }
+
+ fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL,
+ &fence_dma_addr);
+ if (!fence_ptr) {
+ dev_err(hdev->dev,
+ "Failed to allocate fence memory for QMAN0\n");
+ return -ENOMEM;
+ }
+
+ *fence_ptr = 0;
+
+ if (goya->hw_cap_initialized & HW_CAP_MMU) {
+ WREG32(mmDMA_QM_0_GLBL_PROT, QMAN_DMA_FULLY_TRUSTED);
+ RREG32(mmDMA_QM_0_GLBL_PROT);
+ }
+
+ /*
+ * goya cs parser saves space for 2xpacket_msg_prot at end of CB. For
+ * synchronized kernel jobs we only need space for 1 packet_msg_prot
+ */
+ job->job_cb_size -= sizeof(struct packet_msg_prot);
+
+ cb = job->patched_cb;
+
+ fence_pkt = (struct packet_msg_prot *) (uintptr_t) (cb->kernel_address +
+ job->job_cb_size - sizeof(struct packet_msg_prot));
+
+ fence_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) |
+ (1 << GOYA_PKT_CTL_EB_SHIFT) |
+ (1 << GOYA_PKT_CTL_MB_SHIFT);
+ fence_pkt->value = GOYA_QMAN0_FENCE_VAL;
+ fence_pkt->addr = fence_dma_addr +
+ hdev->asic_prop.host_phys_base_address;
+
+ rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_DMA_0,
+ job->job_cb_size, cb->bus_address);
+ if (rc) {
+ dev_err(hdev->dev, "Failed to send CB on QMAN0, %d\n", rc);
+ goto free_fence_ptr;
+ }
+
+ rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) fence_ptr,
+ HL_DEVICE_TIMEOUT_USEC, &tmp);
+
+ hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_DMA_0);
+
+ if ((rc) || (tmp != GOYA_QMAN0_FENCE_VAL)) {
+ dev_err(hdev->dev, "QMAN0 Job hasn't finished in time\n");
+ rc = -ETIMEDOUT;
+ }
+
+free_fence_ptr:
+ hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr,
+ fence_dma_addr);
+
+ if (goya->hw_cap_initialized & HW_CAP_MMU) {
+ WREG32(mmDMA_QM_0_GLBL_PROT, QMAN_DMA_PARTLY_TRUSTED);
+ RREG32(mmDMA_QM_0_GLBL_PROT);
+ }
+
+ return rc;
+}
+
int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
u32 timeout, long *result)
{
@@ -3214,11 +3305,985 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
size);
}
+int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir)
+{
+ if (!dma_map_sg(&hdev->pdev->dev, sg, nents, dir))
+ return -ENOMEM;
+
+ return 0;
+}
+
+void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir)
+{
+ dma_unmap_sg(&hdev->pdev->dev, sg, nents, dir);
+}
+
+u32 goya_get_dma_desc_list_size(struct hl_device *hdev,
+ struct sg_table *sgt)
+{
+ struct scatterlist *sg, *sg_next_iter;
+ u32 count, len, dma_desc_cnt, len_next;
+ dma_addr_t addr, addr_next;
+
+ dma_desc_cnt = 0;
+
+ for_each_sg(sgt->sgl, sg, sgt->nents, count) {
+
+ len = sg_dma_len(sg);
+ addr = sg_dma_address(sg);
+
+ if (len == 0)
+ break;
+
+ while ((count + 1) < sgt->nents) {
+ sg_next_iter = sg_next(sg);
+ len_next = sg_dma_len(sg_next_iter);
+ addr_next = sg_dma_address(sg_next_iter);
+
+ if (len_next == 0)
+ break;
+
+ if ((addr + len == addr_next) &&
+ (len + len_next <= DMA_MAX_TRANSFER_SIZE)) {
+ len += len_next;
+ count++;
+ sg = sg_next_iter;
+ } else {
+ break;
+ }
+ }
+
+ dma_desc_cnt++;
+ }
+
+ return dma_desc_cnt * sizeof(struct packet_lin_dma);
+}
+
+static int goya_pin_memory_before_cs(struct hl_device *hdev,
+ struct hl_cs_parser *parser,
+ struct packet_lin_dma *user_dma_pkt,
+ u64 addr, enum dma_data_direction dir)
+{
+ struct hl_userptr *userptr;
+ int rc;
+
+ if (hl_userptr_is_pinned(hdev, addr, user_dma_pkt->tsize,
+ parser->job_userptr_list, &userptr))
+ goto already_pinned;
+
+ userptr = kzalloc(sizeof(*userptr), GFP_ATOMIC);
+ if (!userptr)
+ return -ENOMEM;
+
+ rc = hl_pin_host_memory(hdev, addr, user_dma_pkt->tsize, userptr);
+ if (rc)
+ goto free_userptr;
+
+ list_add_tail(&userptr->job_node, parser->job_userptr_list);
+
+ rc = hdev->asic_funcs->asic_dma_map_sg(hdev, userptr->sgt->sgl,
+ userptr->sgt->nents, dir);
+ if (rc) {
+ dev_err(hdev->dev, "failed to map sgt with DMA region\n");
+ goto unpin_memory;
+ }
+
+ userptr->dma_mapped = true;
+ userptr->dir = dir;
+
+already_pinned:
+ parser->patched_cb_size +=
+ goya_get_dma_desc_list_size(hdev, userptr->sgt);
+
+ return 0;
+
+unpin_memory:
+ hl_unpin_host_memory(hdev, userptr);
+free_userptr:
+ kfree(userptr);
+ return rc;
+}
+
+static int goya_validate_dma_pkt_host(struct hl_device *hdev,
+ struct hl_cs_parser *parser,
+ struct packet_lin_dma *user_dma_pkt)
+{
+ u64 device_memory_addr, addr;
+ enum dma_data_direction dir;
+ enum goya_dma_direction user_dir;
+ bool sram_addr = true;
+ bool skip_host_mem_pin = false;
+ bool user_memset;
+ int rc = 0;
+
+ user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+ GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+ user_memset = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >>
+ GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT;
+
+ switch (user_dir) {
+ case DMA_HOST_TO_DRAM:
+ dev_dbg(hdev->dev, "DMA direction is HOST --> DRAM\n");
+ dir = DMA_TO_DEVICE;
+ sram_addr = false;
+ addr = user_dma_pkt->src_addr;
+ device_memory_addr = user_dma_pkt->dst_addr;
+ if (user_memset)
+ skip_host_mem_pin = true;
+ break;
+
+ case DMA_DRAM_TO_HOST:
+ dev_dbg(hdev->dev, "DMA direction is DRAM --> HOST\n");
+ dir = DMA_FROM_DEVICE;
+ sram_addr = false;
+ addr = user_dma_pkt->dst_addr;
+ device_memory_addr = user_dma_pkt->src_addr;
+ break;
+
+ case DMA_HOST_TO_SRAM:
+ dev_dbg(hdev->dev, "DMA direction is HOST --> SRAM\n");
+ dir = DMA_TO_DEVICE;
+ addr = user_dma_pkt->src_addr;
+ device_memory_addr = user_dma_pkt->dst_addr;
+ if (user_memset)
+ skip_host_mem_pin = true;
+ break;
+
+ case DMA_SRAM_TO_HOST:
+ dev_dbg(hdev->dev, "DMA direction is SRAM --> HOST\n");
+ dir = DMA_FROM_DEVICE;
+ addr = user_dma_pkt->dst_addr;
+ device_memory_addr = user_dma_pkt->src_addr;
+ break;
+ default:
+ dev_err(hdev->dev, "DMA direction is undefined\n");
+ return -EFAULT;
+ }
+
+ if (parser->ctx_id != HL_KERNEL_ASID_ID) {
+ if (sram_addr) {
+ if (!hl_mem_area_inside_range(device_memory_addr,
+ user_dma_pkt->tsize,
+ hdev->asic_prop.sram_user_base_address,
+ hdev->asic_prop.sram_end_address)) {
+
+ dev_err(hdev->dev,
+ "SRAM address 0x%llx + 0x%x is invalid\n",
+ device_memory_addr,
+ user_dma_pkt->tsize);
+ return -EFAULT;
+ }
+ } else {
+ if (!hl_mem_area_inside_range(device_memory_addr,
+ user_dma_pkt->tsize,
+ hdev->asic_prop.dram_user_base_address,
+ hdev->asic_prop.dram_end_address)) {
+
+ dev_err(hdev->dev,
+ "DRAM address 0x%llx + 0x%x is invalid\n",
+ device_memory_addr,
+ user_dma_pkt->tsize);
+ return -EFAULT;
+ }
+ }
+ }
+
+ if (skip_host_mem_pin)
+ parser->patched_cb_size += sizeof(*user_dma_pkt);
+ else {
+ if ((dir == DMA_TO_DEVICE) &&
+ (parser->hw_queue_id > GOYA_QUEUE_ID_DMA_1)) {
+ dev_err(hdev->dev,
+ "Can't DMA from host on queue other then 1\n");
+ return -EFAULT;
+ }
+
+ rc = goya_pin_memory_before_cs(hdev, parser, user_dma_pkt,
+ addr, dir);
+ }
+
+ return rc;
+}
+
+static int goya_validate_dma_pkt_no_host(struct hl_device *hdev,
+ struct hl_cs_parser *parser,
+ struct packet_lin_dma *user_dma_pkt)
+{
+ u64 sram_memory_addr, dram_memory_addr;
+ enum goya_dma_direction user_dir;
+
+ user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+ GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+ if (user_dir == DMA_DRAM_TO_SRAM) {
+ dev_dbg(hdev->dev, "DMA direction is DRAM --> SRAM\n");
+ dram_memory_addr = user_dma_pkt->src_addr;
+ sram_memory_addr = user_dma_pkt->dst_addr;
+ } else {
+ dev_dbg(hdev->dev, "DMA direction is SRAM --> DRAM\n");
+ sram_memory_addr = user_dma_pkt->src_addr;
+ dram_memory_addr = user_dma_pkt->dst_addr;
+ }
+
+ if (!hl_mem_area_inside_range(sram_memory_addr, user_dma_pkt->tsize,
+ hdev->asic_prop.sram_user_base_address,
+ hdev->asic_prop.sram_end_address)) {
+ dev_err(hdev->dev, "SRAM address 0x%llx + 0x%x is invalid\n",
+ sram_memory_addr, user_dma_pkt->tsize);
+ return -EFAULT;
+ }
+
+ if (!hl_mem_area_inside_range(dram_memory_addr, user_dma_pkt->tsize,
+ hdev->asic_prop.dram_user_base_address,
+ hdev->asic_prop.dram_end_address)) {
+ dev_err(hdev->dev, "DRAM address 0x%llx + 0x%x is invalid\n",
+ dram_memory_addr, user_dma_pkt->tsize);
+ return -EFAULT;
+ }
+
+ parser->patched_cb_size += sizeof(*user_dma_pkt);
+
+ return 0;
+}
+
+static int goya_validate_dma_pkt_no_mmu(struct hl_device *hdev,
+ struct hl_cs_parser *parser,
+ struct packet_lin_dma *user_dma_pkt)
+{
+ enum goya_dma_direction user_dir;
+ int rc;
+
+ dev_dbg(hdev->dev, "DMA packet details:\n");
+ dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr);
+ dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr);
+ dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize);
+
+ user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+ GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+ /*
+ * Special handling for DMA with size 0. The H/W has a bug where
+ * this can cause the QMAN DMA to get stuck, so block it here.
+ */
+ if (user_dma_pkt->tsize == 0) {
+ dev_err(hdev->dev,
+ "Got DMA with size 0, might reset the device\n");
+ return -EINVAL;
+ }
+
+ if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM))
+ rc = goya_validate_dma_pkt_no_host(hdev, parser, user_dma_pkt);
+ else
+ rc = goya_validate_dma_pkt_host(hdev, parser, user_dma_pkt);
+
+ return rc;
+}
+
+static int goya_validate_dma_pkt_mmu(struct hl_device *hdev,
+ struct hl_cs_parser *parser,
+ struct packet_lin_dma *user_dma_pkt)
+{
+ dev_dbg(hdev->dev, "DMA packet details:\n");
+ dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr);
+ dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr);
+ dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize);
+
+ /*
+ * WA for HW-23.
+ * We can't allow user to read from Host using QMANs other than 1.
+ */
+ if (parser->hw_queue_id > GOYA_QUEUE_ID_DMA_1 &&
+ hl_mem_area_inside_range(user_dma_pkt->src_addr,
+ user_dma_pkt->tsize,
+ hdev->asic_prop.va_space_host_start_address,
+ hdev->asic_prop.va_space_host_end_address)) {
+ dev_err(hdev->dev,
+ "Can't DMA from host on queue other then 1\n");
+ return -EFAULT;
+ }
+
+ if (user_dma_pkt->tsize == 0) {
+ dev_err(hdev->dev,
+ "Got DMA with size 0, might reset the device\n");
+ return -EINVAL;
+ }
+
+ parser->patched_cb_size += sizeof(*user_dma_pkt);
+
+ return 0;
+}
+
+static int goya_validate_wreg32(struct hl_device *hdev,
+ struct hl_cs_parser *parser,
+ struct packet_wreg32 *wreg_pkt)
+{
+ struct goya_device *goya = hdev->asic_specific;
+ u32 sob_start_addr, sob_end_addr;
+ u16 reg_offset;
+
+ reg_offset = wreg_pkt->ctl & GOYA_PKT_WREG32_CTL_REG_OFFSET_MASK;
+
+ dev_dbg(hdev->dev, "WREG32 packet details:\n");
+ dev_dbg(hdev->dev, "reg_offset == 0x%x\n", reg_offset);
+ dev_dbg(hdev->dev, "value == 0x%x\n", wreg_pkt->value);
+
+ if (reg_offset != (mmDMA_CH_1_WR_COMP_ADDR_LO & 0xFFFF)) {
+ dev_err(hdev->dev, "WREG32 packet with illegal address 0x%x\n",
+ reg_offset);
+ return -EPERM;
+ }
+
+ /*
+ * With MMU, DMA channels are not secured, so it doesn't matter where
+ * the WR COMP will be written to because it will go out with
+ * non-secured property
+ */
+ if (goya->hw_cap_initialized & HW_CAP_MMU)
+ return 0;
+
+ sob_start_addr = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0);
+ sob_end_addr = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_1023);
+
+ if ((wreg_pkt->value < sob_start_addr) ||
+ (wreg_pkt->value > sob_end_addr)) {
+
+ dev_err(hdev->dev, "WREG32 packet with illegal value 0x%x\n",
+ wreg_pkt->value);
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+static int goya_validate_cb(struct hl_device *hdev,
+ struct hl_cs_parser *parser, bool is_mmu)
+{
+ u32 cb_parsed_length = 0;
+ int rc = 0;
+
+ parser->patched_cb_size = 0;
+
+ /* cb_user_size is more than 0 so loop will always be executed */
+ while (cb_parsed_length < parser->user_cb_size) {
+ enum packet_id pkt_id;
+ u16 pkt_size;
+ void *user_pkt;
+
+ user_pkt = (void *) (uintptr_t)
+ (parser->user_cb->kernel_address + cb_parsed_length);
+
+ pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
+ PACKET_HEADER_PACKET_ID_MASK) >>
+ PACKET_HEADER_PACKET_ID_SHIFT);
+
+ pkt_size = goya_packet_sizes[pkt_id];
+ cb_parsed_length += pkt_size;
+ if (cb_parsed_length > parser->user_cb_size) {
+ dev_err(hdev->dev,
+ "packet 0x%x is out of CB boundary\n", pkt_id);
+ rc = -EINVAL;
+ break;
+ }
+
+ switch (pkt_id) {
+ case PACKET_WREG_32:
+ /*
+ * Although it is validated after copy in patch_cb(),
+ * need to validate here as well because patch_cb() is
+ * not called in MMU path while this function is called
+ */
+ rc = goya_validate_wreg32(hdev, parser, user_pkt);
+ break;
+
+ case PACKET_WREG_BULK:
+ dev_err(hdev->dev,
+ "User not allowed to use WREG_BULK\n");
+ rc = -EPERM;
+ break;
+
+ case PACKET_MSG_PROT:
+ dev_err(hdev->dev,
+ "User not allowed to use MSG_PROT\n");
+ rc = -EPERM;
+ break;
+
+ case PACKET_CP_DMA:
+ dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
+ rc = -EPERM;
+ break;
+
+ case PACKET_STOP:
+ dev_err(hdev->dev, "User not allowed to use STOP\n");
+ rc = -EPERM;
+ break;
+
+ case PACKET_LIN_DMA:
+ if (is_mmu)
+ rc = goya_validate_dma_pkt_mmu(hdev, parser,
+ user_pkt);
+ else
+ rc = goya_validate_dma_pkt_no_mmu(hdev, parser,
+ user_pkt);
+ break;
+
+ case PACKET_MSG_LONG:
+ case PACKET_MSG_SHORT:
+ case PACKET_FENCE:
+ case PACKET_NOP:
+ parser->patched_cb_size += pkt_size;
+ break;
+
+ default:
+ dev_err(hdev->dev, "Invalid packet header 0x%x\n",
+ pkt_id);
+ rc = -EINVAL;
+ break;
+ }
+
+ if (rc)
+ break;
+ }
+
+ /*
+ * The new CB should have space at the end for two MSG_PROT packets:
+ * 1. A packet that will act as a completion packet
+ * 2. A packet that will generate MSI-X interrupt
+ */
+ parser->patched_cb_size += sizeof(struct packet_msg_prot) * 2;
+
+ return rc;
+}
+
+static int goya_patch_dma_packet(struct hl_device *hdev,
+ struct hl_cs_parser *parser,
+ struct packet_lin_dma *user_dma_pkt,
+ struct packet_lin_dma *new_dma_pkt,
+ u32 *new_dma_pkt_size)
+{
+ struct hl_userptr *userptr;
+ struct scatterlist *sg, *sg_next_iter;
+ u32 count, len, dma_desc_cnt, len_next;
+ dma_addr_t dma_addr, dma_addr_next;
+ enum goya_dma_direction user_dir;
+ u64 device_memory_addr, addr;
+ enum dma_data_direction dir;
+ struct sg_table *sgt;
+ bool skip_host_mem_pin = false;
+ bool user_memset;
+ u32 user_rdcomp_mask, user_wrcomp_mask;
+
+ user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
+ GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT;
+
+ user_memset = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >>
+ GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT;
+
+ if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM) ||
+ (user_dma_pkt->tsize == 0)) {
+ memcpy(new_dma_pkt, user_dma_pkt, sizeof(*new_dma_pkt));
+ *new_dma_pkt_size = sizeof(*new_dma_pkt);
+ return 0;
+ }
+
+ if ((user_dir == DMA_HOST_TO_DRAM) || (user_dir == DMA_HOST_TO_SRAM)) {
+ addr = user_dma_pkt->src_addr;
+ device_memory_addr = user_dma_pkt->dst_addr;
+ dir = DMA_TO_DEVICE;
+ if (user_memset)
+ skip_host_mem_pin = true;
+ } else {
+ addr = user_dma_pkt->dst_addr;
+ device_memory_addr = user_dma_pkt->src_addr;
+ dir = DMA_FROM_DEVICE;
+ }
+
+ if ((!skip_host_mem_pin) &&
+ (hl_userptr_is_pinned(hdev, addr, user_dma_pkt->tsize,
+ parser->job_userptr_list, &userptr) == false)) {
+ dev_err(hdev->dev, "Userptr 0x%llx + 0x%x NOT mapped\n",
+ addr, user_dma_pkt->tsize);
+ return -EFAULT;
+ }
+
+ if ((user_memset) && (dir == DMA_TO_DEVICE)) {
+ memcpy(new_dma_pkt, user_dma_pkt, sizeof(*user_dma_pkt));
+ *new_dma_pkt_size = sizeof(*user_dma_pkt);
+ return 0;
+ }
+
+ user_rdcomp_mask =
+ (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK);
+
+ user_wrcomp_mask =
+ (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK);
+
+ sgt = userptr->sgt;
+ dma_desc_cnt = 0;
+
+ for_each_sg(sgt->sgl, sg, sgt->nents, count) {
+ len = sg_dma_len(sg);
+ dma_addr = sg_dma_address(sg);
+
+ if (len == 0)
+ break;
+
+ while ((count + 1) < sgt->nents) {
+ sg_next_iter = sg_next(sg);
+ len_next = sg_dma_len(sg_next_iter);
+ dma_addr_next = sg_dma_address(sg_next_iter);
+
+ if (len_next == 0)
+ break;
+
+ if ((dma_addr + len == dma_addr_next) &&
+ (len + len_next <= DMA_MAX_TRANSFER_SIZE)) {
+ len += len_next;
+ count++;
+ sg = sg_next_iter;
+ } else {
+ break;
+ }
+ }
+
+ new_dma_pkt->ctl = user_dma_pkt->ctl;
+ if (likely(dma_desc_cnt))
+ new_dma_pkt->ctl &= ~GOYA_PKT_CTL_EB_MASK;
+ new_dma_pkt->ctl &= ~(GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK |
+ GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK);
+ new_dma_pkt->tsize = len;
+
+ dma_addr += hdev->asic_prop.host_phys_base_address;
+
+ if (dir == DMA_TO_DEVICE) {
+ new_dma_pkt->src_addr = dma_addr;
+ new_dma_pkt->dst_addr = device_memory_addr;
+ } else {
+ new_dma_pkt->src_addr = device_memory_addr;
+ new_dma_pkt->dst_addr = dma_addr;
+ }
+
+ if (!user_memset)
+ device_memory_addr += len;
+ dma_desc_cnt++;
+ new_dma_pkt++;
+ }
+
+ if (!dma_desc_cnt) {
+ dev_err(hdev->dev,
+ "Error of 0 SG entries when patching DMA packet\n");
+ return -EFAULT;
+ }
+
+ /* Fix the last dma packet - rdcomp/wrcomp must be as user set them */
+ new_dma_pkt--;
+ new_dma_pkt->ctl |= (user_rdcomp_mask | user_wrcomp_mask);
+
+ *new_dma_pkt_size = dma_desc_cnt * sizeof(struct packet_lin_dma);
+
+ return 0;
+}
+
+static int goya_patch_cb(struct hl_device *hdev,
+ struct hl_cs_parser *parser)
+{
+ u32 cb_parsed_length = 0;
+ u32 cb_patched_cur_length = 0;
+ int rc = 0;
+
+ /* cb_user_size is more than 0 so loop will always be executed */
+ while (cb_parsed_length < parser->user_cb_size) {
+ enum packet_id pkt_id;
+ u16 pkt_size;
+ u32 new_pkt_size = 0;
+ void *user_pkt, *kernel_pkt;
+
+ user_pkt = (void *) (uintptr_t)
+ (parser->user_cb->kernel_address + cb_parsed_length);
+ kernel_pkt = (void *) (uintptr_t)
+ (parser->patched_cb->kernel_address +
+ cb_patched_cur_length);
+
+ pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
+ PACKET_HEADER_PACKET_ID_MASK) >>
+ PACKET_HEADER_PACKET_ID_SHIFT);
+
+ pkt_size = goya_packet_sizes[pkt_id];
+ cb_parsed_length += pkt_size;
+ if (cb_parsed_length > parser->user_cb_size) {
+ dev_err(hdev->dev,
+ "packet 0x%x is out of CB boundary\n", pkt_id);
+ rc = -EINVAL;
+ break;
+ }
+
+ switch (pkt_id) {
+ case PACKET_LIN_DMA:
+ rc = goya_patch_dma_packet(hdev, parser, user_pkt,
+ kernel_pkt, &new_pkt_size);
+ cb_patched_cur_length += new_pkt_size;
+ break;
+
+ case PACKET_WREG_32:
+ memcpy(kernel_pkt, user_pkt, pkt_size);
+ cb_patched_cur_length += pkt_size;
+ rc = goya_validate_wreg32(hdev, parser, kernel_pkt);
+ break;
+
+ case PACKET_WREG_BULK:
+ dev_err(hdev->dev,
+ "User not allowed to use WREG_BULK\n");
+ rc = -EPERM;
+ break;
+
+ case PACKET_MSG_PROT:
+ dev_err(hdev->dev,
+ "User not allowed to use MSG_PROT\n");
+ rc = -EPERM;
+ break;
+
+ case PACKET_CP_DMA:
+ dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
+ rc = -EPERM;
+ break;
+
+ case PACKET_STOP:
+ dev_err(hdev->dev, "User not allowed to use STOP\n");
+ rc = -EPERM;
+ break;
+
+ case PACKET_MSG_LONG:
+ case PACKET_MSG_SHORT:
+ case PACKET_FENCE:
+ case PACKET_NOP:
+ memcpy(kernel_pkt, user_pkt, pkt_size);
+ cb_patched_cur_length += pkt_size;
+ break;
+
+ default:
+ dev_err(hdev->dev, "Invalid packet header 0x%x\n",
+ pkt_id);
+ rc = -EINVAL;
+ break;
+ }
+
+ if (rc)
+ break;
+ }
+
+ return rc;
+}
+
+static int goya_parse_cb_mmu(struct hl_device *hdev,
+ struct hl_cs_parser *parser)
+{
+ u64 patched_cb_handle;
+ u32 patched_cb_size;
+ struct hl_cb *user_cb;
+ int rc;
+
+ /*
+ * The new CB should have space at the end for two MSG_PROT pkt:
+ * 1. A packet that will act as a completion packet
+ * 2. A packet that will generate MSI-X interrupt
+ */
+ parser->patched_cb_size = parser->user_cb_size +
+ sizeof(struct packet_msg_prot) * 2;
+
+ rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
+ parser->patched_cb_size,
+ &patched_cb_handle, HL_KERNEL_ASID_ID);
+
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed to allocate patched CB for DMA CS %d\n",
+ rc);
+ return rc;
+ }
+
+ patched_cb_handle >>= PAGE_SHIFT;
+ parser->patched_cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr,
+ (u32) patched_cb_handle);
+ /* hl_cb_get should never fail here so use kernel WARN */
+ WARN(!parser->patched_cb, "DMA CB handle invalid 0x%x\n",
+ (u32) patched_cb_handle);
+ if (!parser->patched_cb) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * The check that parser->user_cb_size <= parser->user_cb->size was done
+ * in validate_queue_index().
+ */
+ memcpy((void *) (uintptr_t) parser->patched_cb->kernel_address,
+ (void *) (uintptr_t) parser->user_cb->kernel_address,
+ parser->user_cb_size);
+
+ patched_cb_size = parser->patched_cb_size;
+
+ /* validate patched CB instead of user CB */
+ user_cb = parser->user_cb;
+ parser->user_cb = parser->patched_cb;
+ rc = goya_validate_cb(hdev, parser, true);
+ parser->user_cb = user_cb;
+
+ if (rc) {
+ hl_cb_put(parser->patched_cb);
+ goto out;
+ }
+
+ if (patched_cb_size != parser->patched_cb_size) {
+ dev_err(hdev->dev, "user CB size mismatch\n");
+ hl_cb_put(parser->patched_cb);
+ rc = -EINVAL;
+ goto out;
+ }
+
+out:
+ /*
+ * Always call cb destroy here because we still have 1 reference
+ * to it by calling cb_get earlier. After the job will be completed,
+ * cb_put will release it, but here we want to remove it from the
+ * idr
+ */
+ hl_cb_destroy(hdev, &hdev->kernel_cb_mgr,
+ patched_cb_handle << PAGE_SHIFT);
+
+ return rc;
+}
+
+int goya_parse_cb_no_mmu(struct hl_device *hdev, struct hl_cs_parser *parser)
+{
+ u64 patched_cb_handle;
+ int rc;
+
+ rc = goya_validate_cb(hdev, parser, false);
+
+ if (rc)
+ goto free_userptr;
+
+ rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
+ parser->patched_cb_size,
+ &patched_cb_handle, HL_KERNEL_ASID_ID);
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed to allocate patched CB for DMA CS %d\n", rc);
+ goto free_userptr;
+ }
+
+ patched_cb_handle >>= PAGE_SHIFT;
+ parser->patched_cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr,
+ (u32) patched_cb_handle);
+ /* hl_cb_get should never fail here so use kernel WARN */
+ WARN(!parser->patched_cb, "DMA CB handle invalid 0x%x\n",
+ (u32) patched_cb_handle);
+ if (!parser->patched_cb) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ rc = goya_patch_cb(hdev, parser);
+
+ if (rc)
+ hl_cb_put(parser->patched_cb);
+
+out:
+ /*
+ * Always call cb destroy here because we still have 1 reference
+ * to it by calling cb_get earlier. After the job will be completed,
+ * cb_put will release it, but here we want to remove it from the
+ * idr
+ */
+ hl_cb_destroy(hdev, &hdev->kernel_cb_mgr,
+ patched_cb_handle << PAGE_SHIFT);
+
+free_userptr:
+ if (rc)
+ hl_userptr_delete_list(hdev, parser->job_userptr_list);
+ return rc;
+}
+
+int goya_parse_cb_no_ext_quque(struct hl_device *hdev,
+ struct hl_cs_parser *parser)
+{
+ struct asic_fixed_properties *asic_prop = &hdev->asic_prop;
+ struct goya_device *goya = hdev->asic_specific;
+
+ if (!(goya->hw_cap_initialized & HW_CAP_MMU)) {
+ /* For internal queue jobs, just check if cb address is valid */
+ if (hl_mem_area_inside_range(
+ (u64) (uintptr_t) parser->user_cb,
+ parser->user_cb_size,
+ asic_prop->sram_user_base_address,
+ asic_prop->sram_end_address))
+ return 0;
+
+ if (hl_mem_area_inside_range(
+ (u64) (uintptr_t) parser->user_cb,
+ parser->user_cb_size,
+ asic_prop->dram_user_base_address,
+ asic_prop->dram_end_address))
+ return 0;
+
+ dev_err(hdev->dev,
+ "Internal CB address 0x%llx + 0x%x is not in SRAM nor in DRAM\n",
+ (u64) (uintptr_t) parser->user_cb,
+ parser->user_cb_size);
+
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
+{
+ struct goya_device *goya = hdev->asic_specific;
+
+ if (!parser->ext_queue)
+ return goya_parse_cb_no_ext_quque(hdev, parser);
+
+ if ((goya->hw_cap_initialized & HW_CAP_MMU) && parser->use_virt_addr)
+ return goya_parse_cb_mmu(hdev, parser);
+ else
+ return goya_parse_cb_no_mmu(hdev, parser);
+}
+
+void goya_add_end_of_cb_packets(u64 kernel_address, u32 len, u64 cq_addr,
+ u32 cq_val, u32 msix_vec)
+{
+ struct packet_msg_prot *cq_pkt;
+
+ cq_pkt = (struct packet_msg_prot *) (uintptr_t)
+ (kernel_address + len - (sizeof(struct packet_msg_prot) * 2));
+
+ cq_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) |
+ (1 << GOYA_PKT_CTL_EB_SHIFT) |
+ (1 << GOYA_PKT_CTL_MB_SHIFT);
+ cq_pkt->value = cq_val;
+ cq_pkt->addr = cq_addr;
+
+ cq_pkt++;
+
+ cq_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) |
+ (1 << GOYA_PKT_CTL_MB_SHIFT);
+ cq_pkt->value = msix_vec & 0x7FF;
+ cq_pkt->addr = CFG_BASE + mmPCIE_DBI_MSIX_DOORBELL_OFF;
+}
+
static void goya_update_eq_ci(struct hl_device *hdev, u32 val)
{
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, val);
}
+int goya_context_switch(struct hl_device *hdev, u32 asid)
+{
+ struct asic_fixed_properties *prop = &hdev->asic_prop;
+ struct packet_lin_dma *clear_sram_pkt;
+ struct hl_cs_parser parser;
+ struct hl_cs_job *job;
+ u32 cb_size;
+ struct hl_cb *cb;
+ int rc;
+
+ cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+ if (!cb)
+ return -EFAULT;
+
+ clear_sram_pkt = (struct packet_lin_dma *)
+ (uintptr_t) cb->kernel_address;
+
+ memset(clear_sram_pkt, 0, sizeof(*clear_sram_pkt));
+ cb_size = sizeof(*clear_sram_pkt);
+
+ clear_sram_pkt->ctl = ((PACKET_LIN_DMA << GOYA_PKT_CTL_OPCODE_SHIFT) |
+ (DMA_HOST_TO_SRAM << GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT) |
+ (1 << GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT) |
+ (1 << GOYA_PKT_LIN_DMA_CTL_WO_SHIFT) |
+ (1 << GOYA_PKT_CTL_RB_SHIFT) |
+ (1 << GOYA_PKT_CTL_MB_SHIFT));
+
+ clear_sram_pkt->src_addr = 0x7777777777777777ull;
+ clear_sram_pkt->dst_addr = prop->sram_base_address;
+ if (hdev->pldm)
+ clear_sram_pkt->tsize = 0x10000;
+ else
+ clear_sram_pkt->tsize = prop->sram_size;
+
+ job = hl_cs_allocate_job(hdev, true);
+ if (!job) {
+ dev_err(hdev->dev, "Failed to allocate a new job\n");
+ rc = -ENOMEM;
+ goto release_cb;
+ }
+
+ job->id = 0;
+ job->user_cb = cb;
+ job->user_cb->cs_cnt++;
+ job->user_cb_size = cb_size;
+ job->hw_queue_id = GOYA_QUEUE_ID_DMA_0;
+
+ parser.ctx_id = HL_KERNEL_ASID_ID;
+ parser.cs_sequence = 0;
+ parser.job_id = job->id;
+ parser.hw_queue_id = job->hw_queue_id;
+ parser.job_userptr_list = &job->userptr_list;
+ parser.user_cb = job->user_cb;
+ parser.user_cb_size = job->user_cb_size;
+ parser.ext_queue = job->ext_queue;
+ parser.use_virt_addr = hdev->mmu_enable;
+
+ rc = hdev->asic_funcs->cs_parser(hdev, &parser);
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed to parse kernel CB during context switch\n");
+ goto free_job;
+ }
+
+ job->patched_cb = parser.patched_cb;
+ job->job_cb_size = parser.patched_cb_size;
+ job->patched_cb->cs_cnt++;
+
+ rc = goya_send_job_on_qman0(hdev, job);
+
+ job->patched_cb->cs_cnt--;
+ hl_cb_put(job->patched_cb);
+
+free_job:
+ hl_userptr_delete_list(hdev, &job->userptr_list);
+ kfree(job);
+ cb->cs_cnt--;
+
+release_cb:
+ hl_cb_put(cb);
+ hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
+
+ return rc;
+}
+
+void goya_restore_phase_topology(struct hl_device *hdev)
+{
+ int i, num_of_sob_in_longs, num_of_mon_in_longs;
+
+ num_of_sob_in_longs =
+ ((mmSYNC_MNGR_SOB_OBJ_1023 - mmSYNC_MNGR_SOB_OBJ_0) + 4);
+
+ num_of_mon_in_longs =
+ ((mmSYNC_MNGR_MON_STATUS_255 - mmSYNC_MNGR_MON_STATUS_0) + 4);
+
+ for (i = 0 ; i < num_of_sob_in_longs ; i += 4)
+ WREG32(mmSYNC_MNGR_SOB_OBJ_0 + i, 0);
+
+ for (i = 0 ; i < num_of_mon_in_longs ; i += 4)
+ WREG32(mmSYNC_MNGR_MON_STATUS_0 + i, 0);
+
+ /* Flush all WREG to prevent race */
+ i = RREG32(mmSYNC_MNGR_SOB_OBJ_0);
+}
+
static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id,
u16 event_type, char *axi_name, int len)
{
@@ -3608,6 +4673,59 @@ static void goya_disable_clock_gating(struct hl_device *hdev)
}
+static bool goya_is_device_idle(struct hl_device *hdev)
+{
+ u64 offset, dma_qm_reg, tpc_qm_reg, tpc_cmdq_reg, tpc_cfg_reg;
+ int i;
+
+ offset = mmDMA_QM_1_GLBL_STS0 - mmDMA_QM_0_GLBL_STS0;
+
+ for (i = 0 ; i < DMA_MAX_NUM ; i++) {
+ dma_qm_reg = mmDMA_QM_0_GLBL_STS0 + i * offset;
+
+ if ((RREG32(dma_qm_reg) & DMA_QM_IDLE_MASK) !=
+ DMA_QM_IDLE_MASK)
+ return false;
+ }
+
+ offset = mmTPC1_QM_GLBL_STS0 - mmTPC0_QM_GLBL_STS0;
+
+ for (i = 0 ; i < TPC_MAX_NUM ; i++) {
+ tpc_qm_reg = mmTPC0_QM_GLBL_STS0 + i * offset;
+ tpc_cmdq_reg = mmTPC0_CMDQ_GLBL_STS0 + i * offset;
+ tpc_cfg_reg = mmTPC0_CFG_STATUS + i * offset;
+
+ if ((RREG32(tpc_qm_reg) & TPC_QM_IDLE_MASK) !=
+ TPC_QM_IDLE_MASK)
+ return false;
+
+ if ((RREG32(tpc_cmdq_reg) & TPC_CMDQ_IDLE_MASK) !=
+ TPC_CMDQ_IDLE_MASK)
+ return false;
+
+ if ((RREG32(tpc_cfg_reg) & TPC_CFG_IDLE_MASK) !=
+ TPC_CFG_IDLE_MASK)
+ return false;
+ }
+
+ if ((RREG32(mmMME_QM_GLBL_STS0) & MME_QM_IDLE_MASK) !=
+ MME_QM_IDLE_MASK)
+ return false;
+
+ if ((RREG32(mmMME_CMDQ_GLBL_STS0) & MME_CMDQ_IDLE_MASK) !=
+ MME_CMDQ_IDLE_MASK)
+ return false;
+
+ if ((RREG32(mmMME_ARCH_STATUS) & MME_ARCH_IDLE_MASK) !=
+ MME_ARCH_IDLE_MASK)
+ return false;
+
+ if (RREG32(mmMME_SHADOW_0_STATUS) & MME_SHADOW_IDLE_MASK)
+ return false;
+
+ return true;
+}
+
static void goya_hw_queues_lock(struct hl_device *hdev)
{
struct goya_device *goya = hdev->asic_specific;
@@ -3700,7 +4818,14 @@ static const struct hl_asic_funcs goya_funcs = {
.dma_pool_free = goya_dma_pool_free,
.cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc,
.cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free,
+ .hl_dma_unmap_sg = goya_dma_unmap_sg,
+ .cs_parser = goya_cs_parser,
+ .asic_dma_map_sg = goya_dma_map_sg,
+ .get_dma_desc_list_size = goya_get_dma_desc_list_size,
+ .add_end_of_cb_packets = goya_add_end_of_cb_packets,
.update_eq_ci = goya_update_eq_ci,
+ .context_switch = goya_context_switch,
+ .restore_phase_topology = goya_restore_phase_topology,
.add_device_attr = goya_add_device_attr,
.handle_eqe = goya_handle_eqe,
.set_pll_profile = goya_set_pll_profile,
@@ -3708,6 +4833,7 @@ static const struct hl_asic_funcs goya_funcs = {
.send_heartbeat = goya_send_heartbeat,
.enable_clock_gating = goya_init_clock_gating,
.disable_clock_gating = goya_disable_clock_gating,
+ .is_device_idle = goya_is_device_idle,
.soft_reset_late_init = goya_soft_reset_late_init,
.hw_queues_lock = goya_hw_queues_lock,
.hw_queues_unlock = goya_hw_queues_unlock,
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 744e37bbc2a6..9adc7c6ec08b 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -16,6 +16,9 @@
#include <linux/cdev.h>
#include <linux/iopoll.h>
#include <linux/irqreturn.h>
+#include <linux/dma-fence.h>
+#include <linux/dma-direction.h>
+#include <linux/scatterlist.h>
#define HL_NAME "habanalabs"
@@ -31,6 +34,11 @@
#define HL_MAX_QUEUES 128
+#define HL_MAX_JOBS_PER_CS 64
+
+/* MUST BE POWER OF 2 and larger than 1 */
+#define HL_MAX_PENDING_CS 64
+
struct hl_device;
struct hl_fpriv;
@@ -62,6 +70,16 @@ struct hw_queue_properties {
};
/**
+ * enum vm_type_t - virtual memory mapping request information.
+ * @VM_TYPE_USERPTR: mapping of user memory to device virtual address.
+ * @VM_TYPE_PHYS_LIST: mapping of DRAM memory to device virtual address.
+ */
+enum vm_type_t {
+ VM_TYPE_USERPTR,
+ VM_TYPE_PHYS_LIST
+};
+
+/**
* enum hl_device_hw_state - H/W device state. use this to understand whether
* to do reset before hw_init or not
* @HL_DEVICE_HW_STATE_CLEAN: H/W state is clean. i.e. after hard reset
@@ -147,6 +165,19 @@ struct asic_fixed_properties {
u8 tpc_enabled_mask;
};
+/**
+ * struct hl_dma_fence - wrapper for fence object used by command submissions.
+ * @base_fence: kernel fence object.
+ * @lock: spinlock to protect fence.
+ * @hdev: habanalabs device structure.
+ * @cs_seq: command submission sequence number.
+ */
+struct hl_dma_fence {
+ struct dma_fence base_fence;
+ spinlock_t lock;
+ struct hl_device *hdev;
+ u64 cs_seq;
+};
/*
* Command Buffers
@@ -175,6 +206,7 @@ struct hl_cb_mgr {
* @mmap_size: Holds the CB's size that was mmaped.
* @size: holds the CB's size.
* @id: the CB's ID.
+ * @cs_cnt: holds number of CS that this CB participates in.
* @ctx_id: holds the ID of the owner's context.
* @mmap: true if the CB is currently mmaped to user.
* @is_pool: true if CB was acquired from the pool, false otherwise.
@@ -189,6 +221,7 @@ struct hl_cb {
u32 mmap_size;
u32 size;
u32 id;
+ u32 cs_cnt;
u32 ctx_id;
u8 mmap;
u8 is_pool;
@@ -313,6 +346,8 @@ enum hl_asic_type {
ASIC_INVALID
};
+struct hl_cs_parser;
+
/**
* enum hl_pm_mng_profile - power management profile.
* @PM_AUTO: internal clock is set by KMD.
@@ -372,7 +407,14 @@ enum hl_pll_frequency {
* @dma_pool_free: free small DMA allocation from pool.
* @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool.
* @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool.
+ * @hl_dma_unmap_sg: DMA unmap scatter-gather list.
+ * @cs_parser: parse Command Submission.
+ * @asic_dma_map_sg: DMA map scatter-gather list.
+ * @get_dma_desc_list_size: get number of LIN_DMA packets required for CB.
+ * @add_end_of_cb_packets: Add packets to the end of CB, if device requires it.
* @update_eq_ci: update event queue CI.
+ * @context_switch: called upon ASID context switch.
+ * @restore_phase_topology: clear all SOBs amd MONs.
* @add_device_attr: add ASIC specific device attributes.
* @handle_eqe: handle event queue entry (IRQ) from ArmCP.
* @set_pll_profile: change PLL profile (manual/automatic).
@@ -380,6 +422,7 @@ enum hl_pll_frequency {
* @send_heartbeat: send is-alive packet to ArmCP and verify response.
* @enable_clock_gating: enable clock gating for reducing power consumption.
* @disable_clock_gating: disable clock for accessing registers on HBW.
+ * @is_device_idle: return true if device is idle, false otherwise.
* @soft_reset_late_init: perform certain actions needed after soft reset.
* @hw_queues_lock: acquire H/W queues lock.
* @hw_queues_unlock: release H/W queues lock.
@@ -419,7 +462,20 @@ struct hl_asic_funcs {
size_t size, dma_addr_t *dma_handle);
void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
size_t size, void *vaddr);
+ void (*hl_dma_unmap_sg)(struct hl_device *hdev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction dir);
+ int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser);
+ int (*asic_dma_map_sg)(struct hl_device *hdev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction dir);
+ u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
+ struct sg_table *sgt);
+ void (*add_end_of_cb_packets)(u64 kernel_address, u32 len, u64 cq_addr,
+ u32 cq_val, u32 msix_num);
void (*update_eq_ci)(struct hl_device *hdev, u32 val);
+ int (*context_switch)(struct hl_device *hdev, u32 asid);
+ void (*restore_phase_topology)(struct hl_device *hdev);
void (*add_device_attr)(struct hl_device *hdev,
struct attribute_group *dev_attr_grp);
void (*handle_eqe)(struct hl_device *hdev,
@@ -430,6 +486,7 @@ struct hl_asic_funcs {
int (*send_heartbeat)(struct hl_device *hdev);
void (*enable_clock_gating)(struct hl_device *hdev);
void (*disable_clock_gating)(struct hl_device *hdev);
+ bool (*is_device_idle)(struct hl_device *hdev);
int (*soft_reset_late_init)(struct hl_device *hdev);
void (*hw_queues_lock)(struct hl_device *hdev);
void (*hw_queues_unlock)(struct hl_device *hdev);
@@ -453,12 +510,28 @@ struct hl_asic_funcs {
* @hdev: pointer to the device structure.
* @refcount: reference counter for the context. Context is released only when
* this hits 0l. It is incremented on CS and CS_WAIT.
+ * @cs_pending: array of DMA fence objects representing pending CS.
+ * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
+ * to user so user could inquire about CS. It is used as
+ * index to cs_pending array.
+ * @cs_lock: spinlock to protect cs_sequence.
+ * @thread_restore_token: token to prevent multiple threads of the same context
+ * from running the restore phase. Only one thread
+ * should run it.
+ * @thread_restore_wait_token: token to prevent the threads that didn't run
+ * the restore phase from moving to their execution
+ * phase before the restore phase has finished.
* @asid: context's unique address space ID in the device's MMU.
*/
struct hl_ctx {
struct hl_fpriv *hpriv;
struct hl_device *hdev;
struct kref refcount;
+ struct dma_fence *cs_pending[HL_MAX_PENDING_CS];
+ u64 cs_sequence;
+ spinlock_t cs_lock;
+ atomic_t thread_restore_token;
+ u32 thread_restore_wait_token;
u32 asid;
};
@@ -473,14 +546,129 @@ struct hl_ctx_mgr {
};
+
+/*
+ * COMMAND SUBMISSIONS
+ */
+
+/**
+ * struct hl_userptr - memory mapping chunk information
+ * @vm_type: type of the VM.
+ * @job_node: linked-list node for hanging the object on the Job's list.
+ * @vec: pointer to the frame vector.
+ * @sgt: pointer to the scatter-gather table that holds the pages.
+ * @dir: for DMA unmapping, the direction must be supplied, so save it.
+ * @debugfs_list: node in debugfs list of command submissions.
+ * @addr: user-space virtual pointer to the start of the memory area.
+ * @size: size of the memory area to pin & map.
+ * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise.
+ */
+struct hl_userptr {
+ enum vm_type_t vm_type; /* must be first */
+ struct list_head job_node;
+ struct frame_vector *vec;
+ struct sg_table *sgt;
+ enum dma_data_direction dir;
+ struct list_head debugfs_list;
+ u64 addr;
+ u32 size;
+ u8 dma_mapped;
+};
+
+/**
+ * struct hl_cs - command submission.
+ * @jobs_in_queue_cnt: per each queue, maintain counter of submitted jobs.
+ * @ctx: the context this CS belongs to.
+ * @job_list: list of the CS's jobs in the various queues.
+ * @job_lock: spinlock for the CS's jobs list. Needed for free_job.
+ * @refcount: reference counter for usage of the CS.
+ * @fence: pointer to the fence object of this CS.
+ * @work_tdr: delayed work node for TDR.
+ * @mirror_node : node in device mirror list of command submissions.
+ * @sequence: the sequence number of this CS.
+ * @submitted: true if CS was submitted to H/W.
+ * @completed: true if CS was completed by device.
+ * @timedout : true if CS was timedout.
+ * @tdr_active: true if TDR was activated for this CS (to prevent
+ * double TDR activation).
+ * @aborted: true if CS was aborted due to some device error.
+ */
+struct hl_cs {
+ u8 jobs_in_queue_cnt[HL_MAX_QUEUES];
+ struct hl_ctx *ctx;
+ struct list_head job_list;
+ spinlock_t job_lock;
+ struct kref refcount;
+ struct dma_fence *fence;
+ struct delayed_work work_tdr;
+ struct list_head mirror_node;
+ u64 sequence;
+ u8 submitted;
+ u8 completed;
+ u8 timedout;
+ u8 tdr_active;
+ u8 aborted;
+};
+
/**
* struct hl_cs_job - command submission job.
+ * @cs_node: the node to hang on the CS jobs list.
+ * @cs: the CS this job belongs to.
+ * @user_cb: the CB we got from the user.
+ * @patched_cb: in case of patching, this is internal CB which is submitted on
+ * the queue instead of the CB we got from the IOCTL.
* @finish_work: workqueue object to run when job is completed.
+ * @userptr_list: linked-list of userptr mappings that belong to this job and
+ * wait for completion.
* @id: the id of this job inside a CS.
+ * @hw_queue_id: the id of the H/W queue this job is submitted to.
+ * @user_cb_size: the actual size of the CB we got from the user.
+ * @job_cb_size: the actual size of the CB that we put on the queue.
+ * @ext_queue: whether the job is for external queue or internal queue.
*/
struct hl_cs_job {
+ struct list_head cs_node;
+ struct hl_cs *cs;
+ struct hl_cb *user_cb;
+ struct hl_cb *patched_cb;
struct work_struct finish_work;
+ struct list_head userptr_list;
u32 id;
+ u32 hw_queue_id;
+ u32 user_cb_size;
+ u32 job_cb_size;
+ u8 ext_queue;
+};
+
+/**
+ * struct hl_cs_parser - command submission paerser properties.
+ * @user_cb: the CB we got from the user.
+ * @patched_cb: in case of patching, this is internal CB which is submitted on
+ * the queue instead of the CB we got from the IOCTL.
+ * @job_userptr_list: linked-list of userptr mappings that belong to the related
+ * job and wait for completion.
+ * @cs_sequence: the sequence number of the related CS.
+ * @ctx_id: the ID of the context the related CS belongs to.
+ * @hw_queue_id: the id of the H/W queue this job is submitted to.
+ * @user_cb_size: the actual size of the CB we got from the user.
+ * @patched_cb_size: the size of the CB after parsing.
+ * @ext_queue: whether the job is for external queue or internal queue.
+ * @job_id: the id of the related job inside the related CS.
+ * @use_virt_addr: whether to treat the addresses in the CB as virtual during
+ * parsing.
+ */
+struct hl_cs_parser {
+ struct hl_cb *user_cb;
+ struct hl_cb *patched_cb;
+ struct list_head *job_userptr_list;
+ u64 cs_sequence;
+ u32 ctx_id;
+ u32 hw_queue_id;
+ u32 user_cb_size;
+ u32 patched_cb_size;
+ u8 ext_queue;
+ u8 job_id;
+ u8 use_virt_addr;
};
@@ -497,6 +685,7 @@ struct hl_cs_job {
* @ctx_mgr: context manager to handle multiple context for this FD.
* @cb_mgr: command buffer manager to handle multiple buffers for this FD.
* @refcount: number of related contexts.
+ * @restore_phase_mutex: lock for context switch and restore phase.
*/
struct hl_fpriv {
struct hl_device *hdev;
@@ -506,6 +695,7 @@ struct hl_fpriv {
struct hl_ctx_mgr ctx_mgr;
struct hl_cb_mgr cb_mgr;
struct kref refcount;
+ struct mutex restore_phase_mutex;
};
@@ -577,6 +767,8 @@ struct hl_device_reset_work {
* @eq_wq: work queue of event queue for executing work in process context.
* @kernel_ctx: KMD context structure.
* @kernel_queues: array of hl_hw_queue.
+ * @hw_queues_mirror_list: CS mirror list for TDR.
+ * @hw_queues_mirror_lock: protects hw_queues_mirror_list.
* @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
* @event_queue: event queue for IRQ from ArmCP.
* @dma_pool: DMA pool for small allocations.
@@ -604,6 +796,7 @@ struct hl_device_reset_work {
* @in_reset: is device in reset flow.
* @curr_pll_profile: current PLL profile.
* @fd_open_cnt: number of open user processes.
+ * @timeout_jiffies: device CS timeout value.
* @max_power: the max power of the device, as configured by the sysadmin. This
* value is saved so in case of hard-reset, KMD will restore this
* value and update the F/W after the re-initialization
@@ -617,7 +810,10 @@ struct hl_device_reset_work {
* @hwmon_initialized: is H/W monitor sensors was initialized.
* @hard_reset_pending: is there a hard reset work pending.
* @heartbeat: is heartbeat sanity check towards ArmCP enabled.
+ * @reset_on_lockup: true if a reset should be done in case of stuck CS, false
+ * otherwise.
* @init_done: is the initialization of the device done.
+ * @mmu_enable: is MMU enabled.
*/
struct hl_device {
struct pci_dev *pdev;
@@ -634,6 +830,8 @@ struct hl_device {
struct workqueue_struct *eq_wq;
struct hl_ctx *kernel_ctx;
struct hl_hw_queue *kernel_queues;
+ struct list_head hw_queues_mirror_list;
+ spinlock_t hw_queues_mirror_lock;
struct hl_cb_mgr kernel_cb_mgr;
struct hl_eq event_queue;
struct dma_pool *dma_pool;
@@ -661,6 +859,7 @@ struct hl_device {
atomic_t in_reset;
atomic_t curr_pll_profile;
atomic_t fd_open_cnt;
+ u64 timeout_jiffies;
u64 max_power;
u32 major;
u32 high_pll;
@@ -672,9 +871,11 @@ struct hl_device {
u8 hwmon_initialized;
u8 hard_reset_pending;
u8 heartbeat;
+ u8 reset_on_lockup;
u8 init_done;
/* Parameters for bring-up */
+ u8 mmu_enable;
u8 cpu_enable;
u8 reset_pcilink;
u8 cpu_queues_enable;
@@ -712,6 +913,58 @@ struct hl_ioctl_desc {
* Kernel module functions that can be accessed by entire module
*/
+/**
+ * hl_mem_area_inside_range() - Checks whether address+size are inside a range.
+ * @address: The start address of the area we want to validate.
+ * @size: The size in bytes of the area we want to validate.
+ * @range_start_address: The start address of the valid range.
+ * @range_end_address: The end address of the valid range.
+ *
+ * Return: true if the area is inside the valid range, false otherwise.
+ */
+static inline bool hl_mem_area_inside_range(u64 address, u32 size,
+ u64 range_start_address, u64 range_end_address)
+{
+ u64 end_address = address + size;
+
+ if ((address >= range_start_address) &&
+ (end_address <= range_end_address) &&
+ (end_address > address))
+ return true;
+
+ return false;
+}
+
+/**
+ * hl_mem_area_crosses_range() - Checks whether address+size crossing a range.
+ * @address: The start address of the area we want to validate.
+ * @size: The size in bytes of the area we want to validate.
+ * @range_start_address: The start address of the valid range.
+ * @range_end_address: The end address of the valid range.
+ *
+ * Return: true if the area overlaps part or all of the valid range,
+ * false otherwise.
+ */
+static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
+ u64 range_start_address, u64 range_end_address)
+{
+ u64 end_address = address + size;
+
+ if ((address >= range_start_address) &&
+ (address < range_end_address))
+ return true;
+
+ if ((end_address >= range_start_address) &&
+ (end_address < range_end_address))
+ return true;
+
+ if ((address < range_start_address) &&
+ (end_address >= range_end_address))
+ return true;
+
+ return false;
+}
+
int hl_device_open(struct inode *inode, struct file *filp);
bool hl_device_disabled_or_in_reset(struct hl_device *hdev);
int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
@@ -725,8 +978,10 @@ int hl_hw_queues_create(struct hl_device *hdev);
void hl_hw_queues_destroy(struct hl_device *hdev);
int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
u32 cb_size, u64 cb_ptr);
+int hl_hw_queue_schedule_cs(struct hl_cs *cs);
u32 hl_hw_queue_add_ptr(u32 ptr, u16 val);
void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id);
+void hl_int_hw_queue_update_ci(struct hl_cs *cs);
void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset);
#define hl_queue_inc_ptr(p) hl_hw_queue_add_ptr(p, 1)
@@ -740,6 +995,8 @@ void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q);
void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q);
irqreturn_t hl_irq_handler_cq(int irq, void *arg);
irqreturn_t hl_irq_handler_eq(int irq, void *arg);
+u32 hl_cq_inc_ptr(u32 ptr);
+
int hl_asid_init(struct hl_device *hdev);
void hl_asid_fini(struct hl_device *hdev);
unsigned long hl_asid_alloc(struct hl_device *hdev);
@@ -748,9 +1005,13 @@ void hl_asid_free(struct hl_device *hdev, unsigned long asid);
int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv);
void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx);
int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
+void hl_ctx_do_release(struct kref *ref);
+void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx);
int hl_ctx_put(struct hl_ctx *ctx);
+struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);
+
int hl_device_init(struct hl_device *hdev, struct class *hclass);
void hl_device_fini(struct hl_device *hdev);
int hl_device_suspend(struct hl_device *hdev);
@@ -782,8 +1043,20 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size);
int hl_cb_pool_init(struct hl_device *hdev);
int hl_cb_pool_fini(struct hl_device *hdev);
+void hl_cs_rollback_all(struct hl_device *hdev);
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue);
+
void goya_set_asic_funcs(struct hl_device *hdev);
+int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
+ struct hl_userptr *userptr);
+int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr);
+void hl_userptr_delete_list(struct hl_device *hdev,
+ struct list_head *userptr_list);
+bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, u32 size,
+ struct list_head *userptr_list,
+ struct hl_userptr **userptr);
+
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq);
long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr);
@@ -799,5 +1072,7 @@ void hl_set_max_power(struct hl_device *hdev, u64 value);
/* IOCTLs */
long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data);
+int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data);
+int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data);
#endif /* HABANALABSP_H_ */
diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c
index b0bf77af1e40..77a1cc85e530 100644
--- a/drivers/misc/habanalabs/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/habanalabs_drv.c
@@ -24,6 +24,17 @@ static struct class *hl_class;
DEFINE_IDR(hl_devs_idr);
DEFINE_MUTEX(hl_devs_idr_lock);
+static int timeout_locked = 5;
+static int reset_on_lockup = 1;
+
+module_param(timeout_locked, int, 0444);
+MODULE_PARM_DESC(timeout_locked,
+ "Device lockup timeout in seconds (0 = disabled, default 5s)");
+
+module_param(reset_on_lockup, int, 0444);
+MODULE_PARM_DESC(reset_on_lockup,
+ "Do device reset on lockup (0 = no, 1 = yes, default yes)");
+
#define PCI_VENDOR_ID_HABANALABS 0x1da3
#define PCI_IDS_GOYA 0x0001
@@ -113,6 +124,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
hpriv->hdev = hdev;
filp->private_data = hpriv;
hpriv->filp = filp;
+ mutex_init(&hpriv->restore_phase_mutex);
kref_init(&hpriv->refcount);
nonseekable_open(inode, filp);
@@ -140,6 +152,7 @@ out_err:
filp->private_data = NULL;
hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
+ mutex_destroy(&hpriv->restore_phase_mutex);
kfree(hpriv);
close_device:
@@ -172,8 +185,10 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
return -ENOMEM;
hdev->major = hl_major;
+ hdev->reset_on_lockup = reset_on_lockup;
/* Parameters for bring-up - set them to defaults */
+ hdev->mmu_enable = 0;
hdev->cpu_enable = 1;
hdev->reset_pcilink = 0;
hdev->cpu_queues_enable = 1;
@@ -193,6 +208,11 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
if (!hdev->cpu_queues_enable)
hdev->heartbeat = 0;
+ if (timeout_locked)
+ hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
+ else
+ hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
+
hdev->disabled = true;
hdev->pdev = pdev; /* can be NULL in case of simulator device */
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index e56a51f6bab6..481db1a5e97e 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -16,7 +16,9 @@
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func}
static const struct hl_ioctl_desc hl_ioctls[] = {
- HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl)
+ HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl),
+ HL_IOCTL_DEF(HL_IOCTL_CS, hl_cs_ioctl),
+ HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_cs_wait_ioctl)
};
#define HL_CORE_IOCTL_COUNT ARRAY_SIZE(hl_ioctls)
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index 2ec43f36cdb8..68dfda59a875 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -34,6 +34,29 @@ static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len)
return (abs(delta) - queue_len);
}
+void hl_int_hw_queue_update_ci(struct hl_cs *cs)
+{
+ struct hl_device *hdev = cs->ctx->hdev;
+ struct hl_hw_queue *q;
+ int i;
+
+ hdev->asic_funcs->hw_queues_lock(hdev);
+
+ if (hdev->disabled)
+ goto out;
+
+ q = &hdev->kernel_queues[0];
+ for (i = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
+ if (q->queue_type == QUEUE_TYPE_INT) {
+ q->ci += cs->jobs_in_queue_cnt[i];
+ q->ci &= ((q->int_queue_len << 1) - 1);
+ }
+ }
+
+out:
+ hdev->asic_funcs->hw_queues_unlock(hdev);
+}
+
/*
* ext_queue_submit_bd - Submit a buffer descriptor to an external queue
*
@@ -120,6 +143,37 @@ static int ext_queue_sanity_checks(struct hl_device *hdev,
}
/*
+ * int_queue_sanity_checks - perform some sanity checks on internal queue
+ *
+ * @hdev : pointer to hl_device structure
+ * @q : pointer to hl_hw_queue structure
+ * @num_of_entries : how many entries to check for space
+ *
+ * H/W queues spinlock should be taken before calling this function
+ *
+ * Perform the following:
+ * - Make sure we have enough space in the h/w queue
+ *
+ */
+static int int_queue_sanity_checks(struct hl_device *hdev,
+ struct hl_hw_queue *q,
+ int num_of_entries)
+{
+ int free_slots_cnt;
+
+ /* Check we have enough space in the queue */
+ free_slots_cnt = queue_free_slots(q, q->int_queue_len);
+
+ if (free_slots_cnt < num_of_entries) {
+ dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
+ q->hw_queue_id, num_of_entries);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/*
* hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
*
* @hdev: pointer to hl_device structure
@@ -166,6 +220,184 @@ out:
}
/*
+ * ext_hw_queue_schedule_job - submit an JOB to an external queue
+ *
+ * @job: pointer to the job that needs to be submitted to the queue
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
+{
+ struct hl_device *hdev = job->cs->ctx->hdev;
+ struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
+ struct hl_cq_entry cq_pkt;
+ struct hl_cq *cq;
+ u64 cq_addr;
+ struct hl_cb *cb;
+ u32 ctl;
+ u32 len;
+ u64 ptr;
+
+ /*
+ * Update the JOB ID inside the BD CTL so the device would know what
+ * to write in the completion queue
+ */
+ ctl = ((q->pi << BD_CTL_SHADOW_INDEX_SHIFT) & BD_CTL_SHADOW_INDEX_MASK);
+
+ cb = job->patched_cb;
+ len = job->job_cb_size;
+ ptr = cb->bus_address;
+
+ cq_pkt.data = (q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
+ & CQ_ENTRY_SHADOW_INDEX_MASK;
+ cq_pkt.data |= 1 << CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT;
+ cq_pkt.data |= 1 << CQ_ENTRY_READY_SHIFT;
+
+ /*
+ * No need to protect pi_offset because scheduling to the
+ * H/W queues is done under the scheduler mutex
+ *
+ * No need to check if CQ is full because it was already
+ * checked in hl_queue_sanity_checks
+ */
+ cq = &hdev->completion_queue[q->hw_queue_id];
+ cq_addr = cq->bus_address +
+ hdev->asic_prop.host_phys_base_address;
+ cq_addr += cq->pi * sizeof(struct hl_cq_entry);
+
+ hdev->asic_funcs->add_end_of_cb_packets(cb->kernel_address, len,
+ cq_addr, cq_pkt.data, q->hw_queue_id);
+
+ q->shadow_queue[hl_pi_2_offset(q->pi)] = job;
+
+ cq->pi = hl_cq_inc_ptr(cq->pi);
+
+ ext_queue_submit_bd(hdev, q, ctl, len, ptr);
+}
+
+/*
+ * int_hw_queue_schedule_job - submit an JOB to an internal queue
+ *
+ * @job: pointer to the job that needs to be submitted to the queue
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void int_hw_queue_schedule_job(struct hl_cs_job *job)
+{
+ struct hl_device *hdev = job->cs->ctx->hdev;
+ struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
+ struct hl_bd bd;
+ u64 *pi, *pbd = (u64 *) &bd;
+
+ bd.ctl = 0;
+ bd.len = job->job_cb_size;
+ bd.ptr = (u64) (uintptr_t) job->user_cb;
+
+ pi = (u64 *) (uintptr_t) (q->kernel_address +
+ ((q->pi & (q->int_queue_len - 1)) * sizeof(bd)));
+
+ pi[0] = pbd[0];
+ pi[1] = pbd[1];
+
+ q->pi++;
+ q->pi &= ((q->int_queue_len << 1) - 1);
+
+ /* Flush PQ entry write. Relevant only for specific ASICs */
+ hdev->asic_funcs->flush_pq_write(hdev, pi, pbd[0]);
+
+ hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
+}
+
+/*
+ * hl_hw_queue_schedule_cs - schedule a command submission
+ *
+ * @job : pointer to the CS
+ *
+ */
+int hl_hw_queue_schedule_cs(struct hl_cs *cs)
+{
+ struct hl_device *hdev = cs->ctx->hdev;
+ struct hl_cs_job *job, *tmp;
+ struct hl_hw_queue *q;
+ int rc = 0, i, cq_cnt;
+
+ hdev->asic_funcs->hw_queues_lock(hdev);
+
+ if (hl_device_disabled_or_in_reset(hdev)) {
+ dev_err(hdev->dev,
+ "device is disabled or in reset, CS rejected!\n");
+ rc = -EPERM;
+ goto out;
+ }
+
+ q = &hdev->kernel_queues[0];
+ /* This loop assumes all external queues are consecutive */
+ for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
+ if (q->queue_type == QUEUE_TYPE_EXT) {
+ if (cs->jobs_in_queue_cnt[i]) {
+ rc = ext_queue_sanity_checks(hdev, q,
+ cs->jobs_in_queue_cnt[i], true);
+ if (rc)
+ goto unroll_cq_resv;
+ cq_cnt++;
+ }
+ } else if (q->queue_type == QUEUE_TYPE_INT) {
+ if (cs->jobs_in_queue_cnt[i]) {
+ rc = int_queue_sanity_checks(hdev, q,
+ cs->jobs_in_queue_cnt[i]);
+ if (rc)
+ goto unroll_cq_resv;
+ }
+ }
+ }
+
+ spin_lock(&hdev->hw_queues_mirror_lock);
+ list_add_tail(&cs->mirror_node, &hdev->hw_queues_mirror_list);
+
+ /* Queue TDR if the CS is the first entry and if timeout is wanted */
+ if ((hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) &&
+ (list_first_entry(&hdev->hw_queues_mirror_list,
+ struct hl_cs, mirror_node) == cs)) {
+ cs->tdr_active = true;
+ schedule_delayed_work(&cs->work_tdr, hdev->timeout_jiffies);
+ spin_unlock(&hdev->hw_queues_mirror_lock);
+ } else {
+ spin_unlock(&hdev->hw_queues_mirror_lock);
+ }
+
+ list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) {
+ if (job->ext_queue)
+ ext_hw_queue_schedule_job(job);
+ else
+ int_hw_queue_schedule_job(job);
+ }
+
+ cs->submitted = true;
+
+ goto out;
+
+unroll_cq_resv:
+ /* This loop assumes all external queues are consecutive */
+ q = &hdev->kernel_queues[0];
+ for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) {
+ if ((q->queue_type == QUEUE_TYPE_EXT) &&
+ (cs->jobs_in_queue_cnt[i])) {
+ atomic_t *free_slots =
+ &hdev->completion_queue[i].free_slots_cnt;
+ atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
+ cq_cnt--;
+ }
+ }
+
+out:
+ hdev->asic_funcs->hw_queues_unlock(hdev);
+
+ return rc;
+}
+
+/*
* hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue
*
* @hdev: pointer to hl_device structure
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
new file mode 100644
index 000000000000..ad14376a1c25
--- /dev/null
+++ b/drivers/misc/habanalabs/memory.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include "habanalabs.h"
+
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+/*
+ * hl_pin_host_memory - pins a chunk of host memory
+ *
+ * @hdev : pointer to the habanalabs device structure
+ * @addr : the user-space virtual address of the memory area
+ * @size : the size of the memory area
+ * @userptr : pointer to hl_userptr structure
+ *
+ * This function does the following:
+ * - Pins the physical pages
+ * - Create a SG list from those pages
+ */
+int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u32 size,
+ struct hl_userptr *userptr)
+{
+ u64 start, end;
+ u32 npages, offset;
+ int rc;
+
+ if (!size) {
+ dev_err(hdev->dev, "size to pin is invalid - %d\n",
+ size);
+ return -EINVAL;
+ }
+
+ if (!access_ok((void __user *) (uintptr_t) addr, size)) {
+ dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n",
+ addr);
+ return -EFAULT;
+ }
+
+ /*
+ * If the combination of the address and size requested for this memory
+ * region causes an integer overflow, return error.
+ */
+ if (((addr + size) < addr) ||
+ PAGE_ALIGN(addr + size) < (addr + size)) {
+ dev_err(hdev->dev,
+ "user pointer 0x%llx + %u causes integer overflow\n",
+ addr, size);
+ return -EINVAL;
+ }
+
+ start = addr & PAGE_MASK;
+ offset = addr & ~PAGE_MASK;
+ end = PAGE_ALIGN(addr + size);
+ npages = (end - start) >> PAGE_SHIFT;
+
+ userptr->size = size;
+ userptr->addr = addr;
+ userptr->dma_mapped = false;
+ INIT_LIST_HEAD(&userptr->job_node);
+
+ userptr->vec = frame_vector_create(npages);
+ if (!userptr->vec) {
+ dev_err(hdev->dev, "Failed to create frame vector\n");
+ return -ENOMEM;
+ }
+
+ rc = get_vaddr_frames(start, npages, FOLL_FORCE | FOLL_WRITE,
+ userptr->vec);
+
+ if (rc != npages) {
+ dev_err(hdev->dev,
+ "Failed to map host memory, user ptr probably wrong\n");
+ if (rc < 0)
+ goto destroy_framevec;
+ rc = -EFAULT;
+ goto put_framevec;
+ }
+
+ if (frame_vector_to_pages(userptr->vec) < 0) {
+ dev_err(hdev->dev,
+ "Failed to translate frame vector to pages\n");
+ rc = -EFAULT;
+ goto put_framevec;
+ }
+
+ userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_ATOMIC);
+ if (!userptr->sgt) {
+ rc = -ENOMEM;
+ goto put_framevec;
+ }
+
+ rc = sg_alloc_table_from_pages(userptr->sgt,
+ frame_vector_pages(userptr->vec),
+ npages, offset, size, GFP_ATOMIC);
+ if (rc < 0) {
+ dev_err(hdev->dev, "failed to create SG table from pages\n");
+ goto free_sgt;
+ }
+
+ return 0;
+
+free_sgt:
+ kfree(userptr->sgt);
+put_framevec:
+ put_vaddr_frames(userptr->vec);
+destroy_framevec:
+ frame_vector_destroy(userptr->vec);
+ return rc;
+}
+
+/*
+ * hl_unpin_host_memory - unpins a chunk of host memory
+ *
+ * @hdev : pointer to the habanalabs device structure
+ * @userptr : pointer to hl_userptr structure
+ *
+ * This function does the following:
+ * - Unpins the physical pages related to the host memory
+ * - Free the SG list
+ */
+int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
+{
+ struct page **pages;
+
+ if (userptr->dma_mapped)
+ hdev->asic_funcs->hl_dma_unmap_sg(hdev,
+ userptr->sgt->sgl,
+ userptr->sgt->nents,
+ userptr->dir);
+
+ pages = frame_vector_pages(userptr->vec);
+ if (!IS_ERR(pages)) {
+ int i;
+
+ for (i = 0; i < frame_vector_count(userptr->vec); i++)
+ set_page_dirty_lock(pages[i]);
+ }
+ put_vaddr_frames(userptr->vec);
+ frame_vector_destroy(userptr->vec);
+
+ list_del(&userptr->job_node);
+
+ sg_free_table(userptr->sgt);
+ kfree(userptr->sgt);
+
+ return 0;
+}
+
+/*
+ * hl_userptr_delete_list - clear userptr list
+ *
+ * @hdev : pointer to the habanalabs device structure
+ * @userptr_list : pointer to the list to clear
+ *
+ * This function does the following:
+ * - Iterates over the list and unpins the host memory and frees the userptr
+ * structure.
+ */
+void hl_userptr_delete_list(struct hl_device *hdev,
+ struct list_head *userptr_list)
+{
+ struct hl_userptr *userptr, *tmp;
+
+ list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) {
+ hl_unpin_host_memory(hdev, userptr);
+ kfree(userptr);
+ }
+
+ INIT_LIST_HEAD(userptr_list);
+}
+
+/*
+ * hl_userptr_is_pinned - returns whether the given userptr is pinned
+ *
+ * @hdev : pointer to the habanalabs device structure
+ * @userptr_list : pointer to the list to clear
+ * @userptr : pointer to userptr to check
+ *
+ * This function does the following:
+ * - Iterates over the list and checks if the given userptr is in it, means is
+ * pinned. If so, returns true, otherwise returns false.
+ */
+bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
+ u32 size, struct list_head *userptr_list,
+ struct hl_userptr **userptr)
+{
+ list_for_each_entry((*userptr), userptr_list, job_node) {
+ if ((addr == (*userptr)->addr) && (size == (*userptr)->size))
+ return true;
+ }
+
+ return false;
+}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 756266cf0416..fba49417f607 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -74,6 +74,95 @@ union hl_cb_args {
};
/*
+ * This structure size must always be fixed to 64-bytes for backward
+ * compatibility
+ */
+struct hl_cs_chunk {
+ /*
+ * For external queue, this represents a Handle of CB on the Host
+ * For internal queue, this represents an SRAM or DRAM address of the
+ * internal CB
+ */
+ __u64 cb_handle;
+ /* Index of queue to put the CB on */
+ __u32 queue_index;
+ /*
+ * Size of command buffer with valid packets
+ * Can be smaller then actual CB size
+ */
+ __u32 cb_size;
+ /* HL_CS_CHUNK_FLAGS_* */
+ __u32 cs_chunk_flags;
+ /* Align structure to 64 bytes */
+ __u32 pad[11];
+};
+
+#define HL_CS_FLAGS_FORCE_RESTORE 0x1
+
+#define HL_CS_STATUS_SUCCESS 0
+
+struct hl_cs_in {
+ /* this holds address of array of hl_cs_chunk for restore phase */
+ __u64 chunks_restore;
+ /* this holds address of array of hl_cs_chunk for execution phase */
+ __u64 chunks_execute;
+ /* this holds address of array of hl_cs_chunk for store phase -
+ * Currently not in use
+ */
+ __u64 chunks_store;
+ /* Number of chunks in restore phase array */
+ __u32 num_chunks_restore;
+ /* Number of chunks in execution array */
+ __u32 num_chunks_execute;
+ /* Number of chunks in restore phase array - Currently not in use */
+ __u32 num_chunks_store;
+ /* HL_CS_FLAGS_* */
+ __u32 cs_flags;
+ /* Context ID - Currently not in use */
+ __u32 ctx_id;
+};
+
+struct hl_cs_out {
+ /* this holds the sequence number of the CS to pass to wait ioctl */
+ __u64 seq;
+ /* HL_CS_STATUS_* */
+ __u32 status;
+ __u32 pad;
+};
+
+union hl_cs_args {
+ struct hl_cs_in in;
+ struct hl_cs_out out;
+};
+
+struct hl_wait_cs_in {
+ /* Command submission sequence number */
+ __u64 seq;
+ /* Absolute timeout to wait in microseconds */
+ __u64 timeout_us;
+ /* Context ID - Currently not in use */
+ __u32 ctx_id;
+ __u32 pad;
+};
+
+#define HL_WAIT_CS_STATUS_COMPLETED 0
+#define HL_WAIT_CS_STATUS_BUSY 1
+#define HL_WAIT_CS_STATUS_TIMEDOUT 2
+#define HL_WAIT_CS_STATUS_ABORTED 3
+#define HL_WAIT_CS_STATUS_INTERRUPTED 4
+
+struct hl_wait_cs_out {
+ /* HL_WAIT_CS_STATUS_* */
+ __u32 status;
+ __u32 pad;
+};
+
+union hl_wait_cs_args {
+ struct hl_wait_cs_in in;
+ struct hl_wait_cs_out out;
+};
+
+/*
* Command Buffer
* - Request a Command Buffer
* - Destroy a Command Buffer
@@ -89,7 +178,74 @@ union hl_cb_args {
#define HL_IOCTL_CB \
_IOWR('H', 0x02, union hl_cb_args)
+/*
+ * Command Submission
+ *
+ * To submit work to the device, the user need to call this IOCTL with a set
+ * of JOBS. That set of JOBS constitutes a CS object.
+ * Each JOB will be enqueued on a specific queue, according to the user's input.
+ * There can be more then one JOB per queue.
+ *
+ * There are two types of queues - external and internal. External queues
+ * are DMA queues which transfer data from/to the Host. All other queues are
+ * internal. The driver will get completion notifications from the device only
+ * on JOBS which are enqueued in the external queues.
+ *
+ * This IOCTL is asynchronous in regard to the actual execution of the CS. This
+ * means it returns immediately after ALL the JOBS were enqueued on their
+ * relevant queues. Therefore, the user mustn't assume the CS has been completed
+ * or has even started to execute.
+ *
+ * Upon successful enqueue, the IOCTL returns an opaque handle which the user
+ * can use with the "Wait for CS" IOCTL to check whether the handle's CS
+ * external JOBS have been completed. Note that if the CS has internal JOBS
+ * which can execute AFTER the external JOBS have finished, the driver might
+ * report that the CS has finished executing BEFORE the internal JOBS have
+ * actually finish executing.
+ *
+ * The CS IOCTL will receive three sets of JOBS. One set is for "restore" phase,
+ * a second set is for "execution" phase and a third set is for "store" phase.
+ * The JOBS on the "restore" phase are enqueued only after context-switch
+ * (or if its the first CS for this context). The user can also order the
+ * driver to run the "restore" phase explicitly
+ *
+ */
+#define HL_IOCTL_CS \
+ _IOWR('H', 0x03, union hl_cs_args)
+
+/*
+ * Wait for Command Submission
+ *
+ * The user can call this IOCTL with a handle it received from the CS IOCTL
+ * to wait until the handle's CS has finished executing. The user will wait
+ * inside the kernel until the CS has finished or until the user-requeusted
+ * timeout has expired.
+ *
+ * The return value of the IOCTL is a standard Linux error code. The possible
+ * values are:
+ *
+ * EINTR - Kernel waiting has been interrupted, e.g. due to OS signal
+ * that the user process received
+ * ETIMEDOUT - The CS has caused a timeout on the device
+ * EIO - The CS was aborted (usually because the device was reset)
+ * ENODEV - The device wants to do hard-reset (so user need to close FD)
+ *
+ * The driver also returns a custom define inside the IOCTL which can be:
+ *
+ * HL_WAIT_CS_STATUS_COMPLETED - The CS has been completed successfully (0)
+ * HL_WAIT_CS_STATUS_BUSY - The CS is still executing (0)
+ * HL_WAIT_CS_STATUS_TIMEDOUT - The CS has caused a timeout on the device
+ * (ETIMEDOUT)
+ * HL_WAIT_CS_STATUS_ABORTED - The CS was aborted, usually because the
+ * device was reset (EIO)
+ * HL_WAIT_CS_STATUS_INTERRUPTED - Waiting for the CS was interrupted (EINTR)
+ *
+ */
+
+#define HL_IOCTL_WAIT_CS \
+ _IOWR('H', 0x04, union hl_wait_cs_args)
+
#define HL_COMMAND_START 0x02
-#define HL_COMMAND_END 0x03
+#define HL_COMMAND_END 0x05
#endif /* HABANALABS_H_ */