summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>2019-08-12 16:51:07 +0200
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2019-08-12 16:51:07 +0200
commit7ea33253e910647876c9ccab720b662ae240a157 (patch)
tree8647b3947a596decaa7ed745cde8fc0f3c2c9d96
parentd45331b00ddb179e291766617259261c112db872 (diff)
parentb421d83a3947369fd5718824aecfaebe1efbf7ed (diff)
Merge tag 'misc-habanalabs-fixes-2019-08-12' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next
Oded writes: This tag contains a couple of important fixes: - Four fixes when running on s390 architecture (BE). With these fixes, the driver is fully functional on Big-endian architectures. The fixes include: - Validation/Patching of user packets - Completion queue handling - Internal H/W queues submission - Device IRQ unmasking operation - Fix to double free in an error path to avoid kernel corruption - Fix to DRAM usage accounting when a user process is terminated forcefully. * tag 'misc-habanalabs-fixes-2019-08-12' of git://people.freedesktop.org/~gabbayo/linux: habanalabs: fix device IRQ unmasking for BE host habanalabs: fix endianness handling for internal QMAN submission habanalabs: fix completion queue handling when host is BE habanalabs: fix endianness handling for packets from user habanalabs: fix DRAM usage accounting on context tear down habanalabs: Avoid double free in error flow
-rw-r--r--drivers/misc/habanalabs/device.c5
-rw-r--r--drivers/misc/habanalabs/goya/goya.c72
-rw-r--r--drivers/misc/habanalabs/goya/goyaP.h2
-rw-r--r--drivers/misc/habanalabs/habanalabs.h9
-rw-r--r--drivers/misc/habanalabs/hw_queue.c14
-rw-r--r--drivers/misc/habanalabs/include/goya/goya_packets.h13
-rw-r--r--drivers/misc/habanalabs/irq.c27
-rw-r--r--drivers/misc/habanalabs/memory.c2
8 files changed, 90 insertions, 54 deletions
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 0c4894dd9c02..7a8f9d0b71b5 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -970,7 +970,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
if (rc) {
dev_err(hdev->dev, "failed to initialize kernel context\n");
- goto free_ctx;
+ kfree(hdev->kernel_ctx);
+ goto mmu_fini;
}
rc = hl_cb_pool_init(hdev);
@@ -1053,8 +1054,6 @@ release_ctx:
if (hl_ctx_put(hdev->kernel_ctx) != 1)
dev_err(hdev->dev,
"kernel ctx is still alive on initialization failure\n");
-free_ctx:
- kfree(hdev->kernel_ctx);
mmu_fini:
hl_mmu_fini(hdev);
eq_fini:
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index a0e181714891..271c5c8f53b4 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -2729,9 +2729,10 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
GOYA_ASYNC_EVENT_ID_PI_UPDATE);
}
-void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val)
+void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd)
{
- /* Not needed in Goya */
+ /* The QMANs are on the SRAM so need to copy to IO space */
+ memcpy_toio((void __iomem *) pqe, bd, sizeof(struct hl_bd));
}
static void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size,
@@ -3313,9 +3314,11 @@ static int goya_validate_dma_pkt_no_mmu(struct hl_device *hdev,
int rc;
dev_dbg(hdev->dev, "DMA packet details:\n");
- dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr);
- dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr);
- dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize);
+ dev_dbg(hdev->dev, "source == 0x%llx\n",
+ le64_to_cpu(user_dma_pkt->src_addr));
+ dev_dbg(hdev->dev, "destination == 0x%llx\n",
+ le64_to_cpu(user_dma_pkt->dst_addr));
+ dev_dbg(hdev->dev, "size == %u\n", le32_to_cpu(user_dma_pkt->tsize));
ctl = le32_to_cpu(user_dma_pkt->ctl);
user_dir = (ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >>
@@ -3344,9 +3347,11 @@ static int goya_validate_dma_pkt_mmu(struct hl_device *hdev,
struct packet_lin_dma *user_dma_pkt)
{
dev_dbg(hdev->dev, "DMA packet details:\n");
- dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr);
- dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr);
- dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize);
+ dev_dbg(hdev->dev, "source == 0x%llx\n",
+ le64_to_cpu(user_dma_pkt->src_addr));
+ dev_dbg(hdev->dev, "destination == 0x%llx\n",
+ le64_to_cpu(user_dma_pkt->dst_addr));
+ dev_dbg(hdev->dev, "size == %u\n", le32_to_cpu(user_dma_pkt->tsize));
/*
* WA for HW-23.
@@ -3386,7 +3391,8 @@ static int goya_validate_wreg32(struct hl_device *hdev,
dev_dbg(hdev->dev, "WREG32 packet details:\n");
dev_dbg(hdev->dev, "reg_offset == 0x%x\n", reg_offset);
- dev_dbg(hdev->dev, "value == 0x%x\n", wreg_pkt->value);
+ dev_dbg(hdev->dev, "value == 0x%x\n",
+ le32_to_cpu(wreg_pkt->value));
if (reg_offset != (mmDMA_CH_0_WR_COMP_ADDR_LO & 0x1FFF)) {
dev_err(hdev->dev, "WREG32 packet with illegal address 0x%x\n",
@@ -3428,12 +3434,13 @@ static int goya_validate_cb(struct hl_device *hdev,
while (cb_parsed_length < parser->user_cb_size) {
enum packet_id pkt_id;
u16 pkt_size;
- void *user_pkt;
+ struct goya_packet *user_pkt;
- user_pkt = (void *) (uintptr_t)
+ user_pkt = (struct goya_packet *) (uintptr_t)
(parser->user_cb->kernel_address + cb_parsed_length);
- pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
+ pkt_id = (enum packet_id) (
+ (le64_to_cpu(user_pkt->header) &
PACKET_HEADER_PACKET_ID_MASK) >>
PACKET_HEADER_PACKET_ID_SHIFT);
@@ -3453,7 +3460,8 @@ static int goya_validate_cb(struct hl_device *hdev,
* need to validate here as well because patch_cb() is
* not called in MMU path while this function is called
*/
- rc = goya_validate_wreg32(hdev, parser, user_pkt);
+ rc = goya_validate_wreg32(hdev,
+ parser, (struct packet_wreg32 *) user_pkt);
break;
case PACKET_WREG_BULK:
@@ -3481,10 +3489,10 @@ static int goya_validate_cb(struct hl_device *hdev,
case PACKET_LIN_DMA:
if (is_mmu)
rc = goya_validate_dma_pkt_mmu(hdev, parser,
- user_pkt);
+ (struct packet_lin_dma *) user_pkt);
else
rc = goya_validate_dma_pkt_no_mmu(hdev, parser,
- user_pkt);
+ (struct packet_lin_dma *) user_pkt);
break;
case PACKET_MSG_LONG:
@@ -3657,15 +3665,16 @@ static int goya_patch_cb(struct hl_device *hdev,
enum packet_id pkt_id;
u16 pkt_size;
u32 new_pkt_size = 0;
- void *user_pkt, *kernel_pkt;
+ struct goya_packet *user_pkt, *kernel_pkt;
- user_pkt = (void *) (uintptr_t)
+ user_pkt = (struct goya_packet *) (uintptr_t)
(parser->user_cb->kernel_address + cb_parsed_length);
- kernel_pkt = (void *) (uintptr_t)
+ kernel_pkt = (struct goya_packet *) (uintptr_t)
(parser->patched_cb->kernel_address +
cb_patched_cur_length);
- pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
+ pkt_id = (enum packet_id) (
+ (le64_to_cpu(user_pkt->header) &
PACKET_HEADER_PACKET_ID_MASK) >>
PACKET_HEADER_PACKET_ID_SHIFT);
@@ -3680,15 +3689,18 @@ static int goya_patch_cb(struct hl_device *hdev,
switch (pkt_id) {
case PACKET_LIN_DMA:
- rc = goya_patch_dma_packet(hdev, parser, user_pkt,
- kernel_pkt, &new_pkt_size);
+ rc = goya_patch_dma_packet(hdev, parser,
+ (struct packet_lin_dma *) user_pkt,
+ (struct packet_lin_dma *) kernel_pkt,
+ &new_pkt_size);
cb_patched_cur_length += new_pkt_size;
break;
case PACKET_WREG_32:
memcpy(kernel_pkt, user_pkt, pkt_size);
cb_patched_cur_length += pkt_size;
- rc = goya_validate_wreg32(hdev, parser, kernel_pkt);
+ rc = goya_validate_wreg32(hdev, parser,
+ (struct packet_wreg32 *) kernel_pkt);
break;
case PACKET_WREG_BULK:
@@ -4352,6 +4364,8 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
size_t total_pkt_size;
long result;
int rc;
+ int irq_num_entries, irq_arr_index;
+ __le32 *goya_irq_arr;
total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) +
irq_arr_size;
@@ -4369,8 +4383,16 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
if (!pkt)
return -ENOMEM;
- pkt->length = cpu_to_le32(irq_arr_size / sizeof(irq_arr[0]));
- memcpy(&pkt->irqs, irq_arr, irq_arr_size);
+ irq_num_entries = irq_arr_size / sizeof(irq_arr[0]);
+ pkt->length = cpu_to_le32(irq_num_entries);
+
+ /* We must perform any necessary endianness conversation on the irq
+ * array being passed to the goya hardware
+ */
+ for (irq_arr_index = 0, goya_irq_arr = (__le32 *) &pkt->irqs;
+ irq_arr_index < irq_num_entries ; irq_arr_index++)
+ goya_irq_arr[irq_arr_index] =
+ cpu_to_le32(irq_arr[irq_arr_index]);
pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
@@ -5042,7 +5064,7 @@ static const struct hl_asic_funcs goya_funcs = {
.resume = goya_resume,
.cb_mmap = goya_cb_mmap,
.ring_doorbell = goya_ring_doorbell,
- .flush_pq_write = goya_flush_pq_write,
+ .pqe_write = goya_pqe_write,
.asic_dma_alloc_coherent = goya_dma_alloc_coherent,
.asic_dma_free_coherent = goya_dma_free_coherent,
.get_int_queue_base = goya_get_int_queue_base,
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index f8c611883dc1..d7f48c9c41cd 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -177,7 +177,7 @@ int goya_late_init(struct hl_device *hdev);
void goya_late_fini(struct hl_device *hdev);
void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
-void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val);
+void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd);
void goya_update_eq_ci(struct hl_device *hdev, u32 val);
void goya_restore_phase_topology(struct hl_device *hdev);
int goya_context_switch(struct hl_device *hdev, u32 asid);
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 6a4c64b97f38..ce83adafcf2d 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -441,7 +441,11 @@ enum hl_pll_frequency {
* @resume: handles IP specific H/W or SW changes for resume.
* @cb_mmap: maps a CB.
* @ring_doorbell: increment PI on a given QMAN.
- * @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed.
+ * @pqe_write: Write the PQ entry to the PQ. This is ASIC-specific
+ * function because the PQs are located in different memory areas
+ * per ASIC (SRAM, DRAM, Host memory) and therefore, the method of
+ * writing the PQE must match the destination memory area
+ * properties.
* @asic_dma_alloc_coherent: Allocate coherent DMA memory by calling
* dma_alloc_coherent(). This is ASIC function because
* its implementation is not trivial when the driver
@@ -510,7 +514,8 @@ struct hl_asic_funcs {
int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
u64 kaddress, phys_addr_t paddress, u32 size);
void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
- void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val);
+ void (*pqe_write)(struct hl_device *hdev, __le64 *pqe,
+ struct hl_bd *bd);
void* (*asic_dma_alloc_coherent)(struct hl_device *hdev, size_t size,
dma_addr_t *dma_handle, gfp_t flag);
void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size,
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index e3b5517897ea..5f5673b74985 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -290,23 +290,19 @@ static void int_hw_queue_schedule_job(struct hl_cs_job *job)
struct hl_device *hdev = job->cs->ctx->hdev;
struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
struct hl_bd bd;
- u64 *pi, *pbd = (u64 *) &bd;
+ __le64 *pi;
bd.ctl = 0;
- bd.len = __cpu_to_le32(job->job_cb_size);
- bd.ptr = __cpu_to_le64((u64) (uintptr_t) job->user_cb);
+ bd.len = cpu_to_le32(job->job_cb_size);
+ bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb);
- pi = (u64 *) (uintptr_t) (q->kernel_address +
+ pi = (__le64 *) (uintptr_t) (q->kernel_address +
((q->pi & (q->int_queue_len - 1)) * sizeof(bd)));
- pi[0] = pbd[0];
- pi[1] = pbd[1];
-
q->pi++;
q->pi &= ((q->int_queue_len << 1) - 1);
- /* Flush PQ entry write. Relevant only for specific ASICs */
- hdev->asic_funcs->flush_pq_write(hdev, pi, pbd[0]);
+ hdev->asic_funcs->pqe_write(hdev, pi, &bd);
hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
}
diff --git a/drivers/misc/habanalabs/include/goya/goya_packets.h b/drivers/misc/habanalabs/include/goya/goya_packets.h
index a14407b975e4..ef54bad20509 100644
--- a/drivers/misc/habanalabs/include/goya/goya_packets.h
+++ b/drivers/misc/habanalabs/include/goya/goya_packets.h
@@ -52,6 +52,19 @@ enum goya_dma_direction {
#define GOYA_PKT_CTL_MB_SHIFT 31
#define GOYA_PKT_CTL_MB_MASK 0x80000000
+/* All packets have, at least, an 8-byte header, which contains
+ * the packet type. The kernel driver uses the packet header for packet
+ * validation and to perform any necessary required preparation before
+ * sending them off to the hardware.
+ */
+struct goya_packet {
+ __le64 header;
+ /* The rest of the packet data follows. Use the corresponding
+ * packet_XXX struct to deference the data, based on packet type
+ */
+ u8 contents[0];
+};
+
struct packet_nop {
__le32 reserved;
__le32 ctl;
diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c
index ea9f72ff456c..199791b57caf 100644
--- a/drivers/misc/habanalabs/irq.c
+++ b/drivers/misc/habanalabs/irq.c
@@ -80,8 +80,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg)
struct hl_cs_job *job;
bool shadow_index_valid;
u16 shadow_index;
- u32 *cq_entry;
- u32 *cq_base;
+ struct hl_cq_entry *cq_entry, *cq_base;
if (hdev->disabled) {
dev_dbg(hdev->dev,
@@ -90,29 +89,29 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg)
return IRQ_HANDLED;
}
- cq_base = (u32 *) (uintptr_t) cq->kernel_address;
+ cq_base = (struct hl_cq_entry *) (uintptr_t) cq->kernel_address;
while (1) {
- bool entry_ready = ((cq_base[cq->ci] & CQ_ENTRY_READY_MASK)
+ bool entry_ready = ((le32_to_cpu(cq_base[cq->ci].data) &
+ CQ_ENTRY_READY_MASK)
>> CQ_ENTRY_READY_SHIFT);
if (!entry_ready)
break;
- cq_entry = (u32 *) &cq_base[cq->ci];
+ cq_entry = (struct hl_cq_entry *) &cq_base[cq->ci];
- /*
- * Make sure we read CQ entry contents after we've
+ /* Make sure we read CQ entry contents after we've
* checked the ownership bit.
*/
dma_rmb();
- shadow_index_valid =
- ((*cq_entry & CQ_ENTRY_SHADOW_INDEX_VALID_MASK)
+ shadow_index_valid = ((le32_to_cpu(cq_entry->data) &
+ CQ_ENTRY_SHADOW_INDEX_VALID_MASK)
>> CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT);
- shadow_index = (u16)
- ((*cq_entry & CQ_ENTRY_SHADOW_INDEX_MASK)
+ shadow_index = (u16) ((le32_to_cpu(cq_entry->data) &
+ CQ_ENTRY_SHADOW_INDEX_MASK)
>> CQ_ENTRY_SHADOW_INDEX_SHIFT);
queue = &hdev->kernel_queues[cq->hw_queue_id];
@@ -122,8 +121,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg)
queue_work(hdev->cq_wq, &job->finish_work);
}
- /*
- * Update ci of the context's queue. There is no
+ /* Update ci of the context's queue. There is no
* need to protect it with spinlock because this update is
* done only inside IRQ and there is a different IRQ per
* queue
@@ -131,7 +129,8 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg)
queue->ci = hl_queue_inc_ptr(queue->ci);
/* Clear CQ entry ready bit */
- cq_base[cq->ci] &= ~CQ_ENTRY_READY_MASK;
+ cq_entry->data = cpu_to_le32(le32_to_cpu(cq_entry->data) &
+ ~CQ_ENTRY_READY_MASK);
cq->ci = hl_cq_inc_ptr(cq->ci);
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index 42d237cae1dc..365fb0cb8dff 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -1629,6 +1629,8 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
dev_dbg(hdev->dev,
"page list 0x%p of asid %d is still alive\n",
phys_pg_list, ctx->asid);
+ atomic64_sub(phys_pg_list->total_size,
+ &hdev->dram_used_mem);
free_phys_pg_pack(hdev, phys_pg_list);
idr_remove(&vm->phys_pg_pack_handles, i);
}