diff options
Diffstat (limited to 'drivers/misc/habanalabs/goya/goya.c')
-rw-r--r-- | drivers/misc/habanalabs/goya/goya.c | 1126 |
1 files changed, 1126 insertions, 0 deletions
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 1fe1d6a1ff9e..e3878fd7dc94 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -95,6 +95,19 @@ static const char goya_irq_name[GOYA_MSIX_ENTRIES][GOYA_MAX_STRING_LEN] = { "goya cq 4", "goya cpu eq" }; +static u16 goya_packet_sizes[MAX_PACKET_ID] = { + [PACKET_WREG_32] = sizeof(struct packet_wreg32), + [PACKET_WREG_BULK] = sizeof(struct packet_wreg_bulk), + [PACKET_MSG_LONG] = sizeof(struct packet_msg_long), + [PACKET_MSG_SHORT] = sizeof(struct packet_msg_short), + [PACKET_CP_DMA] = sizeof(struct packet_cp_dma), + [PACKET_MSG_PROT] = sizeof(struct packet_msg_prot), + [PACKET_FENCE] = sizeof(struct packet_fence), + [PACKET_LIN_DMA] = sizeof(struct packet_lin_dma), + [PACKET_NOP] = sizeof(struct packet_nop), + [PACKET_STOP] = sizeof(struct packet_stop) +}; + static const char *goya_axi_name[GOYA_MAX_INITIATORS] = { "MME0", "MME1", @@ -2978,6 +2991,84 @@ void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id, return base; } +int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) +{ + struct goya_device *goya = hdev->asic_specific; + struct packet_msg_prot *fence_pkt; + u32 *fence_ptr; + dma_addr_t fence_dma_addr; + struct hl_cb *cb; + u32 tmp; + int rc; + + if (!hdev->asic_funcs->is_device_idle(hdev)) { + dev_err_ratelimited(hdev->dev, + "Can't send KMD job on QMAN0 if device is not idle\n"); + return -EFAULT; + } + + fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL, + &fence_dma_addr); + if (!fence_ptr) { + dev_err(hdev->dev, + "Failed to allocate fence memory for QMAN0\n"); + return -ENOMEM; + } + + *fence_ptr = 0; + + if (goya->hw_cap_initialized & HW_CAP_MMU) { + WREG32(mmDMA_QM_0_GLBL_PROT, QMAN_DMA_FULLY_TRUSTED); + RREG32(mmDMA_QM_0_GLBL_PROT); + } + + /* + * goya cs parser saves space for 2xpacket_msg_prot at end of CB. For + * synchronized kernel jobs we only need space for 1 packet_msg_prot + */ + job->job_cb_size -= sizeof(struct packet_msg_prot); + + cb = job->patched_cb; + + fence_pkt = (struct packet_msg_prot *) (uintptr_t) (cb->kernel_address + + job->job_cb_size - sizeof(struct packet_msg_prot)); + + fence_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) | + (1 << GOYA_PKT_CTL_EB_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT); + fence_pkt->value = GOYA_QMAN0_FENCE_VAL; + fence_pkt->addr = fence_dma_addr + + hdev->asic_prop.host_phys_base_address; + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_DMA_0, + job->job_cb_size, cb->bus_address); + if (rc) { + dev_err(hdev->dev, "Failed to send CB on QMAN0, %d\n", rc); + goto free_fence_ptr; + } + + rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) fence_ptr, + HL_DEVICE_TIMEOUT_USEC, &tmp); + + hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_DMA_0); + + if ((rc) || (tmp != GOYA_QMAN0_FENCE_VAL)) { + dev_err(hdev->dev, "QMAN0 Job hasn't finished in time\n"); + rc = -ETIMEDOUT; + } + +free_fence_ptr: + hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr, + fence_dma_addr); + + if (goya->hw_cap_initialized & HW_CAP_MMU) { + WREG32(mmDMA_QM_0_GLBL_PROT, QMAN_DMA_PARTLY_TRUSTED); + RREG32(mmDMA_QM_0_GLBL_PROT); + } + + return rc; +} + int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len, u32 timeout, long *result) { @@ -3214,11 +3305,985 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, size); } +int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + if (!dma_map_sg(&hdev->pdev->dev, sg, nents, dir)) + return -ENOMEM; + + return 0; +} + +void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + dma_unmap_sg(&hdev->pdev->dev, sg, nents, dir); +} + +u32 goya_get_dma_desc_list_size(struct hl_device *hdev, + struct sg_table *sgt) +{ + struct scatterlist *sg, *sg_next_iter; + u32 count, len, dma_desc_cnt, len_next; + dma_addr_t addr, addr_next; + + dma_desc_cnt = 0; + + for_each_sg(sgt->sgl, sg, sgt->nents, count) { + + len = sg_dma_len(sg); + addr = sg_dma_address(sg); + + if (len == 0) + break; + + while ((count + 1) < sgt->nents) { + sg_next_iter = sg_next(sg); + len_next = sg_dma_len(sg_next_iter); + addr_next = sg_dma_address(sg_next_iter); + + if (len_next == 0) + break; + + if ((addr + len == addr_next) && + (len + len_next <= DMA_MAX_TRANSFER_SIZE)) { + len += len_next; + count++; + sg = sg_next_iter; + } else { + break; + } + } + + dma_desc_cnt++; + } + + return dma_desc_cnt * sizeof(struct packet_lin_dma); +} + +static int goya_pin_memory_before_cs(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt, + u64 addr, enum dma_data_direction dir) +{ + struct hl_userptr *userptr; + int rc; + + if (hl_userptr_is_pinned(hdev, addr, user_dma_pkt->tsize, + parser->job_userptr_list, &userptr)) + goto already_pinned; + + userptr = kzalloc(sizeof(*userptr), GFP_ATOMIC); + if (!userptr) + return -ENOMEM; + + rc = hl_pin_host_memory(hdev, addr, user_dma_pkt->tsize, userptr); + if (rc) + goto free_userptr; + + list_add_tail(&userptr->job_node, parser->job_userptr_list); + + rc = hdev->asic_funcs->asic_dma_map_sg(hdev, userptr->sgt->sgl, + userptr->sgt->nents, dir); + if (rc) { + dev_err(hdev->dev, "failed to map sgt with DMA region\n"); + goto unpin_memory; + } + + userptr->dma_mapped = true; + userptr->dir = dir; + +already_pinned: + parser->patched_cb_size += + goya_get_dma_desc_list_size(hdev, userptr->sgt); + + return 0; + +unpin_memory: + hl_unpin_host_memory(hdev, userptr); +free_userptr: + kfree(userptr); + return rc; +} + +static int goya_validate_dma_pkt_host(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt) +{ + u64 device_memory_addr, addr; + enum dma_data_direction dir; + enum goya_dma_direction user_dir; + bool sram_addr = true; + bool skip_host_mem_pin = false; + bool user_memset; + int rc = 0; + + user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >> + GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT; + + user_memset = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >> + GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT; + + switch (user_dir) { + case DMA_HOST_TO_DRAM: + dev_dbg(hdev->dev, "DMA direction is HOST --> DRAM\n"); + dir = DMA_TO_DEVICE; + sram_addr = false; + addr = user_dma_pkt->src_addr; + device_memory_addr = user_dma_pkt->dst_addr; + if (user_memset) + skip_host_mem_pin = true; + break; + + case DMA_DRAM_TO_HOST: + dev_dbg(hdev->dev, "DMA direction is DRAM --> HOST\n"); + dir = DMA_FROM_DEVICE; + sram_addr = false; + addr = user_dma_pkt->dst_addr; + device_memory_addr = user_dma_pkt->src_addr; + break; + + case DMA_HOST_TO_SRAM: + dev_dbg(hdev->dev, "DMA direction is HOST --> SRAM\n"); + dir = DMA_TO_DEVICE; + addr = user_dma_pkt->src_addr; + device_memory_addr = user_dma_pkt->dst_addr; + if (user_memset) + skip_host_mem_pin = true; + break; + + case DMA_SRAM_TO_HOST: + dev_dbg(hdev->dev, "DMA direction is SRAM --> HOST\n"); + dir = DMA_FROM_DEVICE; + addr = user_dma_pkt->dst_addr; + device_memory_addr = user_dma_pkt->src_addr; + break; + default: + dev_err(hdev->dev, "DMA direction is undefined\n"); + return -EFAULT; + } + + if (parser->ctx_id != HL_KERNEL_ASID_ID) { + if (sram_addr) { + if (!hl_mem_area_inside_range(device_memory_addr, + user_dma_pkt->tsize, + hdev->asic_prop.sram_user_base_address, + hdev->asic_prop.sram_end_address)) { + + dev_err(hdev->dev, + "SRAM address 0x%llx + 0x%x is invalid\n", + device_memory_addr, + user_dma_pkt->tsize); + return -EFAULT; + } + } else { + if (!hl_mem_area_inside_range(device_memory_addr, + user_dma_pkt->tsize, + hdev->asic_prop.dram_user_base_address, + hdev->asic_prop.dram_end_address)) { + + dev_err(hdev->dev, + "DRAM address 0x%llx + 0x%x is invalid\n", + device_memory_addr, + user_dma_pkt->tsize); + return -EFAULT; + } + } + } + + if (skip_host_mem_pin) + parser->patched_cb_size += sizeof(*user_dma_pkt); + else { + if ((dir == DMA_TO_DEVICE) && + (parser->hw_queue_id > GOYA_QUEUE_ID_DMA_1)) { + dev_err(hdev->dev, + "Can't DMA from host on queue other then 1\n"); + return -EFAULT; + } + + rc = goya_pin_memory_before_cs(hdev, parser, user_dma_pkt, + addr, dir); + } + + return rc; +} + +static int goya_validate_dma_pkt_no_host(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt) +{ + u64 sram_memory_addr, dram_memory_addr; + enum goya_dma_direction user_dir; + + user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >> + GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT; + + if (user_dir == DMA_DRAM_TO_SRAM) { + dev_dbg(hdev->dev, "DMA direction is DRAM --> SRAM\n"); + dram_memory_addr = user_dma_pkt->src_addr; + sram_memory_addr = user_dma_pkt->dst_addr; + } else { + dev_dbg(hdev->dev, "DMA direction is SRAM --> DRAM\n"); + sram_memory_addr = user_dma_pkt->src_addr; + dram_memory_addr = user_dma_pkt->dst_addr; + } + + if (!hl_mem_area_inside_range(sram_memory_addr, user_dma_pkt->tsize, + hdev->asic_prop.sram_user_base_address, + hdev->asic_prop.sram_end_address)) { + dev_err(hdev->dev, "SRAM address 0x%llx + 0x%x is invalid\n", + sram_memory_addr, user_dma_pkt->tsize); + return -EFAULT; + } + + if (!hl_mem_area_inside_range(dram_memory_addr, user_dma_pkt->tsize, + hdev->asic_prop.dram_user_base_address, + hdev->asic_prop.dram_end_address)) { + dev_err(hdev->dev, "DRAM address 0x%llx + 0x%x is invalid\n", + dram_memory_addr, user_dma_pkt->tsize); + return -EFAULT; + } + + parser->patched_cb_size += sizeof(*user_dma_pkt); + + return 0; +} + +static int goya_validate_dma_pkt_no_mmu(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt) +{ + enum goya_dma_direction user_dir; + int rc; + + dev_dbg(hdev->dev, "DMA packet details:\n"); + dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr); + dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr); + dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize); + + user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >> + GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT; + + /* + * Special handling for DMA with size 0. The H/W has a bug where + * this can cause the QMAN DMA to get stuck, so block it here. + */ + if (user_dma_pkt->tsize == 0) { + dev_err(hdev->dev, + "Got DMA with size 0, might reset the device\n"); + return -EINVAL; + } + + if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM)) + rc = goya_validate_dma_pkt_no_host(hdev, parser, user_dma_pkt); + else + rc = goya_validate_dma_pkt_host(hdev, parser, user_dma_pkt); + + return rc; +} + +static int goya_validate_dma_pkt_mmu(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt) +{ + dev_dbg(hdev->dev, "DMA packet details:\n"); + dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr); + dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr); + dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize); + + /* + * WA for HW-23. + * We can't allow user to read from Host using QMANs other than 1. + */ + if (parser->hw_queue_id > GOYA_QUEUE_ID_DMA_1 && + hl_mem_area_inside_range(user_dma_pkt->src_addr, + user_dma_pkt->tsize, + hdev->asic_prop.va_space_host_start_address, + hdev->asic_prop.va_space_host_end_address)) { + dev_err(hdev->dev, + "Can't DMA from host on queue other then 1\n"); + return -EFAULT; + } + + if (user_dma_pkt->tsize == 0) { + dev_err(hdev->dev, + "Got DMA with size 0, might reset the device\n"); + return -EINVAL; + } + + parser->patched_cb_size += sizeof(*user_dma_pkt); + + return 0; +} + +static int goya_validate_wreg32(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_wreg32 *wreg_pkt) +{ + struct goya_device *goya = hdev->asic_specific; + u32 sob_start_addr, sob_end_addr; + u16 reg_offset; + + reg_offset = wreg_pkt->ctl & GOYA_PKT_WREG32_CTL_REG_OFFSET_MASK; + + dev_dbg(hdev->dev, "WREG32 packet details:\n"); + dev_dbg(hdev->dev, "reg_offset == 0x%x\n", reg_offset); + dev_dbg(hdev->dev, "value == 0x%x\n", wreg_pkt->value); + + if (reg_offset != (mmDMA_CH_1_WR_COMP_ADDR_LO & 0xFFFF)) { + dev_err(hdev->dev, "WREG32 packet with illegal address 0x%x\n", + reg_offset); + return -EPERM; + } + + /* + * With MMU, DMA channels are not secured, so it doesn't matter where + * the WR COMP will be written to because it will go out with + * non-secured property + */ + if (goya->hw_cap_initialized & HW_CAP_MMU) + return 0; + + sob_start_addr = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_0); + sob_end_addr = lower_32_bits(CFG_BASE + mmSYNC_MNGR_SOB_OBJ_1023); + + if ((wreg_pkt->value < sob_start_addr) || + (wreg_pkt->value > sob_end_addr)) { + + dev_err(hdev->dev, "WREG32 packet with illegal value 0x%x\n", + wreg_pkt->value); + return -EPERM; + } + + return 0; +} + +static int goya_validate_cb(struct hl_device *hdev, + struct hl_cs_parser *parser, bool is_mmu) +{ + u32 cb_parsed_length = 0; + int rc = 0; + + parser->patched_cb_size = 0; + + /* cb_user_size is more than 0 so loop will always be executed */ + while (cb_parsed_length < parser->user_cb_size) { + enum packet_id pkt_id; + u16 pkt_size; + void *user_pkt; + + user_pkt = (void *) (uintptr_t) + (parser->user_cb->kernel_address + cb_parsed_length); + + pkt_id = (enum packet_id) (((*(u64 *) user_pkt) & + PACKET_HEADER_PACKET_ID_MASK) >> + PACKET_HEADER_PACKET_ID_SHIFT); + + pkt_size = goya_packet_sizes[pkt_id]; + cb_parsed_length += pkt_size; + if (cb_parsed_length > parser->user_cb_size) { + dev_err(hdev->dev, + "packet 0x%x is out of CB boundary\n", pkt_id); + rc = -EINVAL; + break; + } + + switch (pkt_id) { + case PACKET_WREG_32: + /* + * Although it is validated after copy in patch_cb(), + * need to validate here as well because patch_cb() is + * not called in MMU path while this function is called + */ + rc = goya_validate_wreg32(hdev, parser, user_pkt); + break; + + case PACKET_WREG_BULK: + dev_err(hdev->dev, + "User not allowed to use WREG_BULK\n"); + rc = -EPERM; + break; + + case PACKET_MSG_PROT: + dev_err(hdev->dev, + "User not allowed to use MSG_PROT\n"); + rc = -EPERM; + break; + + case PACKET_CP_DMA: + dev_err(hdev->dev, "User not allowed to use CP_DMA\n"); + rc = -EPERM; + break; + + case PACKET_STOP: + dev_err(hdev->dev, "User not allowed to use STOP\n"); + rc = -EPERM; + break; + + case PACKET_LIN_DMA: + if (is_mmu) + rc = goya_validate_dma_pkt_mmu(hdev, parser, + user_pkt); + else + rc = goya_validate_dma_pkt_no_mmu(hdev, parser, + user_pkt); + break; + + case PACKET_MSG_LONG: + case PACKET_MSG_SHORT: + case PACKET_FENCE: + case PACKET_NOP: + parser->patched_cb_size += pkt_size; + break; + + default: + dev_err(hdev->dev, "Invalid packet header 0x%x\n", + pkt_id); + rc = -EINVAL; + break; + } + + if (rc) + break; + } + + /* + * The new CB should have space at the end for two MSG_PROT packets: + * 1. A packet that will act as a completion packet + * 2. A packet that will generate MSI-X interrupt + */ + parser->patched_cb_size += sizeof(struct packet_msg_prot) * 2; + + return rc; +} + +static int goya_patch_dma_packet(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_lin_dma *user_dma_pkt, + struct packet_lin_dma *new_dma_pkt, + u32 *new_dma_pkt_size) +{ + struct hl_userptr *userptr; + struct scatterlist *sg, *sg_next_iter; + u32 count, len, dma_desc_cnt, len_next; + dma_addr_t dma_addr, dma_addr_next; + enum goya_dma_direction user_dir; + u64 device_memory_addr, addr; + enum dma_data_direction dir; + struct sg_table *sgt; + bool skip_host_mem_pin = false; + bool user_memset; + u32 user_rdcomp_mask, user_wrcomp_mask; + + user_dir = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >> + GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT; + + user_memset = (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_MEMSET_MASK) >> + GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT; + + if ((user_dir == DMA_DRAM_TO_SRAM) || (user_dir == DMA_SRAM_TO_DRAM) || + (user_dma_pkt->tsize == 0)) { + memcpy(new_dma_pkt, user_dma_pkt, sizeof(*new_dma_pkt)); + *new_dma_pkt_size = sizeof(*new_dma_pkt); + return 0; + } + + if ((user_dir == DMA_HOST_TO_DRAM) || (user_dir == DMA_HOST_TO_SRAM)) { + addr = user_dma_pkt->src_addr; + device_memory_addr = user_dma_pkt->dst_addr; + dir = DMA_TO_DEVICE; + if (user_memset) + skip_host_mem_pin = true; + } else { + addr = user_dma_pkt->dst_addr; + device_memory_addr = user_dma_pkt->src_addr; + dir = DMA_FROM_DEVICE; + } + + if ((!skip_host_mem_pin) && + (hl_userptr_is_pinned(hdev, addr, user_dma_pkt->tsize, + parser->job_userptr_list, &userptr) == false)) { + dev_err(hdev->dev, "Userptr 0x%llx + 0x%x NOT mapped\n", + addr, user_dma_pkt->tsize); + return -EFAULT; + } + + if ((user_memset) && (dir == DMA_TO_DEVICE)) { + memcpy(new_dma_pkt, user_dma_pkt, sizeof(*user_dma_pkt)); + *new_dma_pkt_size = sizeof(*user_dma_pkt); + return 0; + } + + user_rdcomp_mask = + (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK); + + user_wrcomp_mask = + (user_dma_pkt->ctl & GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK); + + sgt = userptr->sgt; + dma_desc_cnt = 0; + + for_each_sg(sgt->sgl, sg, sgt->nents, count) { + len = sg_dma_len(sg); + dma_addr = sg_dma_address(sg); + + if (len == 0) + break; + + while ((count + 1) < sgt->nents) { + sg_next_iter = sg_next(sg); + len_next = sg_dma_len(sg_next_iter); + dma_addr_next = sg_dma_address(sg_next_iter); + + if (len_next == 0) + break; + + if ((dma_addr + len == dma_addr_next) && + (len + len_next <= DMA_MAX_TRANSFER_SIZE)) { + len += len_next; + count++; + sg = sg_next_iter; + } else { + break; + } + } + + new_dma_pkt->ctl = user_dma_pkt->ctl; + if (likely(dma_desc_cnt)) + new_dma_pkt->ctl &= ~GOYA_PKT_CTL_EB_MASK; + new_dma_pkt->ctl &= ~(GOYA_PKT_LIN_DMA_CTL_RDCOMP_MASK | + GOYA_PKT_LIN_DMA_CTL_WRCOMP_MASK); + new_dma_pkt->tsize = len; + + dma_addr += hdev->asic_prop.host_phys_base_address; + + if (dir == DMA_TO_DEVICE) { + new_dma_pkt->src_addr = dma_addr; + new_dma_pkt->dst_addr = device_memory_addr; + } else { + new_dma_pkt->src_addr = device_memory_addr; + new_dma_pkt->dst_addr = dma_addr; + } + + if (!user_memset) + device_memory_addr += len; + dma_desc_cnt++; + new_dma_pkt++; + } + + if (!dma_desc_cnt) { + dev_err(hdev->dev, + "Error of 0 SG entries when patching DMA packet\n"); + return -EFAULT; + } + + /* Fix the last dma packet - rdcomp/wrcomp must be as user set them */ + new_dma_pkt--; + new_dma_pkt->ctl |= (user_rdcomp_mask | user_wrcomp_mask); + + *new_dma_pkt_size = dma_desc_cnt * sizeof(struct packet_lin_dma); + + return 0; +} + +static int goya_patch_cb(struct hl_device *hdev, + struct hl_cs_parser *parser) +{ + u32 cb_parsed_length = 0; + u32 cb_patched_cur_length = 0; + int rc = 0; + + /* cb_user_size is more than 0 so loop will always be executed */ + while (cb_parsed_length < parser->user_cb_size) { + enum packet_id pkt_id; + u16 pkt_size; + u32 new_pkt_size = 0; + void *user_pkt, *kernel_pkt; + + user_pkt = (void *) (uintptr_t) + (parser->user_cb->kernel_address + cb_parsed_length); + kernel_pkt = (void *) (uintptr_t) + (parser->patched_cb->kernel_address + + cb_patched_cur_length); + + pkt_id = (enum packet_id) (((*(u64 *) user_pkt) & + PACKET_HEADER_PACKET_ID_MASK) >> + PACKET_HEADER_PACKET_ID_SHIFT); + + pkt_size = goya_packet_sizes[pkt_id]; + cb_parsed_length += pkt_size; + if (cb_parsed_length > parser->user_cb_size) { + dev_err(hdev->dev, + "packet 0x%x is out of CB boundary\n", pkt_id); + rc = -EINVAL; + break; + } + + switch (pkt_id) { + case PACKET_LIN_DMA: + rc = goya_patch_dma_packet(hdev, parser, user_pkt, + kernel_pkt, &new_pkt_size); + cb_patched_cur_length += new_pkt_size; + break; + + case PACKET_WREG_32: + memcpy(kernel_pkt, user_pkt, pkt_size); + cb_patched_cur_length += pkt_size; + rc = goya_validate_wreg32(hdev, parser, kernel_pkt); + break; + + case PACKET_WREG_BULK: + dev_err(hdev->dev, + "User not allowed to use WREG_BULK\n"); + rc = -EPERM; + break; + + case PACKET_MSG_PROT: + dev_err(hdev->dev, + "User not allowed to use MSG_PROT\n"); + rc = -EPERM; + break; + + case PACKET_CP_DMA: + dev_err(hdev->dev, "User not allowed to use CP_DMA\n"); + rc = -EPERM; + break; + + case PACKET_STOP: + dev_err(hdev->dev, "User not allowed to use STOP\n"); + rc = -EPERM; + break; + + case PACKET_MSG_LONG: + case PACKET_MSG_SHORT: + case PACKET_FENCE: + case PACKET_NOP: + memcpy(kernel_pkt, user_pkt, pkt_size); + cb_patched_cur_length += pkt_size; + break; + + default: + dev_err(hdev->dev, "Invalid packet header 0x%x\n", + pkt_id); + rc = -EINVAL; + break; + } + + if (rc) + break; + } + + return rc; +} + +static int goya_parse_cb_mmu(struct hl_device *hdev, + struct hl_cs_parser *parser) +{ + u64 patched_cb_handle; + u32 patched_cb_size; + struct hl_cb *user_cb; + int rc; + + /* + * The new CB should have space at the end for two MSG_PROT pkt: + * 1. A packet that will act as a completion packet + * 2. A packet that will generate MSI-X interrupt + */ + parser->patched_cb_size = parser->user_cb_size + + sizeof(struct packet_msg_prot) * 2; + + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, + parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID); + + if (rc) { + dev_err(hdev->dev, + "Failed to allocate patched CB for DMA CS %d\n", + rc); + return rc; + } + + patched_cb_handle >>= PAGE_SHIFT; + parser->patched_cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr, + (u32) patched_cb_handle); + /* hl_cb_get should never fail here so use kernel WARN */ + WARN(!parser->patched_cb, "DMA CB handle invalid 0x%x\n", + (u32) patched_cb_handle); + if (!parser->patched_cb) { + rc = -EFAULT; + goto out; + } + + /* + * The check that parser->user_cb_size <= parser->user_cb->size was done + * in validate_queue_index(). + */ + memcpy((void *) (uintptr_t) parser->patched_cb->kernel_address, + (void *) (uintptr_t) parser->user_cb->kernel_address, + parser->user_cb_size); + + patched_cb_size = parser->patched_cb_size; + + /* validate patched CB instead of user CB */ + user_cb = parser->user_cb; + parser->user_cb = parser->patched_cb; + rc = goya_validate_cb(hdev, parser, true); + parser->user_cb = user_cb; + + if (rc) { + hl_cb_put(parser->patched_cb); + goto out; + } + + if (patched_cb_size != parser->patched_cb_size) { + dev_err(hdev->dev, "user CB size mismatch\n"); + hl_cb_put(parser->patched_cb); + rc = -EINVAL; + goto out; + } + +out: + /* + * Always call cb destroy here because we still have 1 reference + * to it by calling cb_get earlier. After the job will be completed, + * cb_put will release it, but here we want to remove it from the + * idr + */ + hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, + patched_cb_handle << PAGE_SHIFT); + + return rc; +} + +int goya_parse_cb_no_mmu(struct hl_device *hdev, struct hl_cs_parser *parser) +{ + u64 patched_cb_handle; + int rc; + + rc = goya_validate_cb(hdev, parser, false); + + if (rc) + goto free_userptr; + + rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, + parser->patched_cb_size, + &patched_cb_handle, HL_KERNEL_ASID_ID); + if (rc) { + dev_err(hdev->dev, + "Failed to allocate patched CB for DMA CS %d\n", rc); + goto free_userptr; + } + + patched_cb_handle >>= PAGE_SHIFT; + parser->patched_cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr, + (u32) patched_cb_handle); + /* hl_cb_get should never fail here so use kernel WARN */ + WARN(!parser->patched_cb, "DMA CB handle invalid 0x%x\n", + (u32) patched_cb_handle); + if (!parser->patched_cb) { + rc = -EFAULT; + goto out; + } + + rc = goya_patch_cb(hdev, parser); + + if (rc) + hl_cb_put(parser->patched_cb); + +out: + /* + * Always call cb destroy here because we still have 1 reference + * to it by calling cb_get earlier. After the job will be completed, + * cb_put will release it, but here we want to remove it from the + * idr + */ + hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, + patched_cb_handle << PAGE_SHIFT); + +free_userptr: + if (rc) + hl_userptr_delete_list(hdev, parser->job_userptr_list); + return rc; +} + +int goya_parse_cb_no_ext_quque(struct hl_device *hdev, + struct hl_cs_parser *parser) +{ + struct asic_fixed_properties *asic_prop = &hdev->asic_prop; + struct goya_device *goya = hdev->asic_specific; + + if (!(goya->hw_cap_initialized & HW_CAP_MMU)) { + /* For internal queue jobs, just check if cb address is valid */ + if (hl_mem_area_inside_range( + (u64) (uintptr_t) parser->user_cb, + parser->user_cb_size, + asic_prop->sram_user_base_address, + asic_prop->sram_end_address)) + return 0; + + if (hl_mem_area_inside_range( + (u64) (uintptr_t) parser->user_cb, + parser->user_cb_size, + asic_prop->dram_user_base_address, + asic_prop->dram_end_address)) + return 0; + + dev_err(hdev->dev, + "Internal CB address 0x%llx + 0x%x is not in SRAM nor in DRAM\n", + (u64) (uintptr_t) parser->user_cb, + parser->user_cb_size); + + return -EFAULT; + } + + return 0; +} + +int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser) +{ + struct goya_device *goya = hdev->asic_specific; + + if (!parser->ext_queue) + return goya_parse_cb_no_ext_quque(hdev, parser); + + if ((goya->hw_cap_initialized & HW_CAP_MMU) && parser->use_virt_addr) + return goya_parse_cb_mmu(hdev, parser); + else + return goya_parse_cb_no_mmu(hdev, parser); +} + +void goya_add_end_of_cb_packets(u64 kernel_address, u32 len, u64 cq_addr, + u32 cq_val, u32 msix_vec) +{ + struct packet_msg_prot *cq_pkt; + + cq_pkt = (struct packet_msg_prot *) (uintptr_t) + (kernel_address + len - (sizeof(struct packet_msg_prot) * 2)); + + cq_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) | + (1 << GOYA_PKT_CTL_EB_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT); + cq_pkt->value = cq_val; + cq_pkt->addr = cq_addr; + + cq_pkt++; + + cq_pkt->ctl = (PACKET_MSG_PROT << GOYA_PKT_CTL_OPCODE_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT); + cq_pkt->value = msix_vec & 0x7FF; + cq_pkt->addr = CFG_BASE + mmPCIE_DBI_MSIX_DOORBELL_OFF; +} + static void goya_update_eq_ci(struct hl_device *hdev, u32 val) { WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, val); } +int goya_context_switch(struct hl_device *hdev, u32 asid) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct packet_lin_dma *clear_sram_pkt; + struct hl_cs_parser parser; + struct hl_cs_job *job; + u32 cb_size; + struct hl_cb *cb; + int rc; + + cb = hl_cb_kernel_create(hdev, PAGE_SIZE); + if (!cb) + return -EFAULT; + + clear_sram_pkt = (struct packet_lin_dma *) + (uintptr_t) cb->kernel_address; + + memset(clear_sram_pkt, 0, sizeof(*clear_sram_pkt)); + cb_size = sizeof(*clear_sram_pkt); + + clear_sram_pkt->ctl = ((PACKET_LIN_DMA << GOYA_PKT_CTL_OPCODE_SHIFT) | + (DMA_HOST_TO_SRAM << GOYA_PKT_LIN_DMA_CTL_DMA_DIR_SHIFT) | + (1 << GOYA_PKT_LIN_DMA_CTL_MEMSET_SHIFT) | + (1 << GOYA_PKT_LIN_DMA_CTL_WO_SHIFT) | + (1 << GOYA_PKT_CTL_RB_SHIFT) | + (1 << GOYA_PKT_CTL_MB_SHIFT)); + + clear_sram_pkt->src_addr = 0x7777777777777777ull; + clear_sram_pkt->dst_addr = prop->sram_base_address; + if (hdev->pldm) + clear_sram_pkt->tsize = 0x10000; + else + clear_sram_pkt->tsize = prop->sram_size; + + job = hl_cs_allocate_job(hdev, true); + if (!job) { + dev_err(hdev->dev, "Failed to allocate a new job\n"); + rc = -ENOMEM; + goto release_cb; + } + + job->id = 0; + job->user_cb = cb; + job->user_cb->cs_cnt++; + job->user_cb_size = cb_size; + job->hw_queue_id = GOYA_QUEUE_ID_DMA_0; + + parser.ctx_id = HL_KERNEL_ASID_ID; + parser.cs_sequence = 0; + parser.job_id = job->id; + parser.hw_queue_id = job->hw_queue_id; + parser.job_userptr_list = &job->userptr_list; + parser.user_cb = job->user_cb; + parser.user_cb_size = job->user_cb_size; + parser.ext_queue = job->ext_queue; + parser.use_virt_addr = hdev->mmu_enable; + + rc = hdev->asic_funcs->cs_parser(hdev, &parser); + if (rc) { + dev_err(hdev->dev, + "Failed to parse kernel CB during context switch\n"); + goto free_job; + } + + job->patched_cb = parser.patched_cb; + job->job_cb_size = parser.patched_cb_size; + job->patched_cb->cs_cnt++; + + rc = goya_send_job_on_qman0(hdev, job); + + job->patched_cb->cs_cnt--; + hl_cb_put(job->patched_cb); + +free_job: + hl_userptr_delete_list(hdev, &job->userptr_list); + kfree(job); + cb->cs_cnt--; + +release_cb: + hl_cb_put(cb); + hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT); + + return rc; +} + +void goya_restore_phase_topology(struct hl_device *hdev) +{ + int i, num_of_sob_in_longs, num_of_mon_in_longs; + + num_of_sob_in_longs = + ((mmSYNC_MNGR_SOB_OBJ_1023 - mmSYNC_MNGR_SOB_OBJ_0) + 4); + + num_of_mon_in_longs = + ((mmSYNC_MNGR_MON_STATUS_255 - mmSYNC_MNGR_MON_STATUS_0) + 4); + + for (i = 0 ; i < num_of_sob_in_longs ; i += 4) + WREG32(mmSYNC_MNGR_SOB_OBJ_0 + i, 0); + + for (i = 0 ; i < num_of_mon_in_longs ; i += 4) + WREG32(mmSYNC_MNGR_MON_STATUS_0 + i, 0); + + /* Flush all WREG to prevent race */ + i = RREG32(mmSYNC_MNGR_SOB_OBJ_0); +} + static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id, u16 event_type, char *axi_name, int len) { @@ -3608,6 +4673,59 @@ static void goya_disable_clock_gating(struct hl_device *hdev) } +static bool goya_is_device_idle(struct hl_device *hdev) +{ + u64 offset, dma_qm_reg, tpc_qm_reg, tpc_cmdq_reg, tpc_cfg_reg; + int i; + + offset = mmDMA_QM_1_GLBL_STS0 - mmDMA_QM_0_GLBL_STS0; + + for (i = 0 ; i < DMA_MAX_NUM ; i++) { + dma_qm_reg = mmDMA_QM_0_GLBL_STS0 + i * offset; + + if ((RREG32(dma_qm_reg) & DMA_QM_IDLE_MASK) != + DMA_QM_IDLE_MASK) + return false; + } + + offset = mmTPC1_QM_GLBL_STS0 - mmTPC0_QM_GLBL_STS0; + + for (i = 0 ; i < TPC_MAX_NUM ; i++) { + tpc_qm_reg = mmTPC0_QM_GLBL_STS0 + i * offset; + tpc_cmdq_reg = mmTPC0_CMDQ_GLBL_STS0 + i * offset; + tpc_cfg_reg = mmTPC0_CFG_STATUS + i * offset; + + if ((RREG32(tpc_qm_reg) & TPC_QM_IDLE_MASK) != + TPC_QM_IDLE_MASK) + return false; + + if ((RREG32(tpc_cmdq_reg) & TPC_CMDQ_IDLE_MASK) != + TPC_CMDQ_IDLE_MASK) + return false; + + if ((RREG32(tpc_cfg_reg) & TPC_CFG_IDLE_MASK) != + TPC_CFG_IDLE_MASK) + return false; + } + + if ((RREG32(mmMME_QM_GLBL_STS0) & MME_QM_IDLE_MASK) != + MME_QM_IDLE_MASK) + return false; + + if ((RREG32(mmMME_CMDQ_GLBL_STS0) & MME_CMDQ_IDLE_MASK) != + MME_CMDQ_IDLE_MASK) + return false; + + if ((RREG32(mmMME_ARCH_STATUS) & MME_ARCH_IDLE_MASK) != + MME_ARCH_IDLE_MASK) + return false; + + if (RREG32(mmMME_SHADOW_0_STATUS) & MME_SHADOW_IDLE_MASK) + return false; + + return true; +} + static void goya_hw_queues_lock(struct hl_device *hdev) { struct goya_device *goya = hdev->asic_specific; @@ -3700,7 +4818,14 @@ static const struct hl_asic_funcs goya_funcs = { .dma_pool_free = goya_dma_pool_free, .cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc, .cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free, + .hl_dma_unmap_sg = goya_dma_unmap_sg, + .cs_parser = goya_cs_parser, + .asic_dma_map_sg = goya_dma_map_sg, + .get_dma_desc_list_size = goya_get_dma_desc_list_size, + .add_end_of_cb_packets = goya_add_end_of_cb_packets, .update_eq_ci = goya_update_eq_ci, + .context_switch = goya_context_switch, + .restore_phase_topology = goya_restore_phase_topology, .add_device_attr = goya_add_device_attr, .handle_eqe = goya_handle_eqe, .set_pll_profile = goya_set_pll_profile, @@ -3708,6 +4833,7 @@ static const struct hl_asic_funcs goya_funcs = { .send_heartbeat = goya_send_heartbeat, .enable_clock_gating = goya_init_clock_gating, .disable_clock_gating = goya_disable_clock_gating, + .is_device_idle = goya_is_device_idle, .soft_reset_late_init = goya_soft_reset_late_init, .hw_queues_lock = goya_hw_queues_lock, .hw_queues_unlock = goya_hw_queues_unlock, |