summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2023-04-11 14:37:20 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2023-04-11 14:37:20 +1000
commit2909cca706bf8c6dd62f9c3b14e98a46e3ed3f42 (patch)
tree9ee11f3af04fe11c897f3235f8a0b57d5f9d6810
parentdea3215f33f2ba4e29046036d13de1df9c18b2da (diff)
parent56499c461589634f2c89ffbd9cfb78268191d349 (diff)
downloadlinux-next-2909cca706bf8c6dd62f9c3b14e98a46e3ed3f42.tar.gz
Merge branch 'habanalabs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux.git
-rw-r--r--drivers/accel/habanalabs/common/command_buffer.c15
-rw-r--r--drivers/accel/habanalabs/common/decoder.c40
-rw-r--r--drivers/accel/habanalabs/common/device.c54
-rw-r--r--drivers/accel/habanalabs/common/firmware_if.c17
-rw-r--r--drivers/accel/habanalabs/common/habanalabs.h14
-rw-r--r--drivers/accel/habanalabs/common/irq.c11
-rw-r--r--drivers/accel/habanalabs/common/memory.c11
-rw-r--r--drivers/accel/habanalabs/common/mmu/mmu.c8
-rw-r--r--drivers/accel/habanalabs/common/pci/pci.c2
-rw-r--r--drivers/accel/habanalabs/common/sysfs.c6
-rw-r--r--drivers/accel/habanalabs/gaudi/gaudi.c86
-rw-r--r--drivers/accel/habanalabs/gaudi/gaudiP.h15
-rw-r--r--drivers/accel/habanalabs/gaudi2/gaudi2.c347
-rw-r--r--drivers/accel/habanalabs/gaudi2/gaudi2P.h17
-rw-r--r--drivers/accel/habanalabs/goya/goya.c1
-rw-r--r--drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h4
-rw-r--r--include/uapi/drm/habanalabs_accel.h3
17 files changed, 382 insertions, 269 deletions
diff --git a/drivers/accel/habanalabs/common/command_buffer.c b/drivers/accel/habanalabs/common/command_buffer.c
index 3a0535ac28b1..6e09f48750a0 100644
--- a/drivers/accel/habanalabs/common/command_buffer.c
+++ b/drivers/accel/habanalabs/common/command_buffer.c
@@ -45,20 +45,29 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
}
mutex_lock(&hdev->mmu_lock);
+
rc = hl_mmu_map_contiguous(ctx, cb->virtual_addr, cb->bus_address, cb->roundup_size);
if (rc) {
dev_err(hdev->dev, "Failed to map VA %#llx to CB\n", cb->virtual_addr);
- goto err_va_umap;
+ goto err_va_pool_free;
}
+
rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV);
+ if (rc)
+ goto err_mmu_unmap;
+
mutex_unlock(&hdev->mmu_lock);
cb->is_mmu_mapped = true;
- return rc;
-err_va_umap:
+ return 0;
+
+err_mmu_unmap:
+ hl_mmu_unmap_contiguous(ctx, cb->virtual_addr, cb->roundup_size);
+err_va_pool_free:
mutex_unlock(&hdev->mmu_lock);
gen_pool_free(ctx->cb_va_pool, cb->virtual_addr, cb->roundup_size);
+
return rc;
}
diff --git a/drivers/accel/habanalabs/common/decoder.c b/drivers/accel/habanalabs/common/decoder.c
index 69c78c1784b4..c03a6da45d00 100644
--- a/drivers/accel/habanalabs/common/decoder.c
+++ b/drivers/accel/habanalabs/common/decoder.c
@@ -43,48 +43,46 @@ static void dec_print_abnrm_intr_source(struct hl_device *hdev, u32 irq_status)
intr_source[2], intr_source[3], intr_source[4], intr_source[5]);
}
-static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id)
+static void dec_abnrm_intr_work(struct work_struct *work)
{
+ struct hl_dec *dec = container_of(work, struct hl_dec, abnrm_intr_work);
+ struct hl_device *hdev = dec->hdev;
+ u32 irq_status, event_mask = 0;
bool reset_required = false;
- u32 irq_status, event_mask;
- irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
+ irq_status = RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET);
- dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, core_id);
+ dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, dec->core_id);
dec_print_abnrm_intr_source(hdev, irq_status);
/* Clear the interrupt */
- WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);
+ WREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);
/* Flush the interrupt clear */
- RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
+ RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET);
if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) {
reset_required = true;
- event_mask = HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
- } else if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) {
- event_mask = HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
- } else {
- event_mask = HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
}
+ if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK)
+ event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
+
+ if (irq_status & (VCMD_IRQ_STATUS_ENDCMD_MASK |
+ VCMD_IRQ_STATUS_BUSERR_MASK |
+ VCMD_IRQ_STATUS_ABORT_MASK))
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
+
if (reset_required) {
event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
hl_device_cond_reset(hdev, 0, event_mask);
- } else {
+ } else if (event_mask) {
hl_notifier_event_send_all(hdev, event_mask);
}
}
-static void dec_completion_abnrm(struct work_struct *work)
-{
- struct hl_dec *dec = container_of(work, struct hl_dec, completion_abnrm_work);
- struct hl_device *hdev = dec->hdev;
-
- dec_error_intr_work(hdev, dec->base_addr, dec->core_id);
-}
-
void hl_dec_fini(struct hl_device *hdev)
{
kfree(hdev->dec);
@@ -108,7 +106,7 @@ int hl_dec_init(struct hl_device *hdev)
dec = hdev->dec + j;
dec->hdev = hdev;
- INIT_WORK(&dec->completion_abnrm_work, dec_completion_abnrm);
+ INIT_WORK(&dec->abnrm_intr_work, dec_abnrm_intr_work);
dec->core_id = j;
dec->base_addr = hdev->asic_funcs->get_dec_base_addr(hdev, j);
if (!dec->base_addr) {
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 713005998cbc..fabfc501ef54 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1271,7 +1271,6 @@ int hl_device_resume(struct hl_device *hdev)
return 0;
disable_device:
- pci_clear_master(hdev->pdev);
pci_disable_device(hdev->pdev);
return rc;
@@ -1381,6 +1380,34 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d
mutex_unlock(fd_lock);
}
+static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
+{
+ /* If reset is due to heartbeat, device CPU is no responsive in
+ * which case no point sending PCI disable message to it.
+ */
+ if ((flags & HL_DRV_RESET_HARD) &&
+ !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
+ /* Disable PCI access from device F/W so he won't send
+ * us additional interrupts. We disable MSI/MSI-X at
+ * the halt_engines function and we can't have the F/W
+ * sending us interrupts after that. We need to disable
+ * the access here because if the device is marked
+ * disable, the message won't be send. Also, in case
+ * of heartbeat, the device CPU is marked as disable
+ * so this message won't be sent
+ */
+ if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
+ dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
+ return;
+ }
+
+ /* verify that last EQs are handled before disabled is set */
+ if (hdev->cpu_queues_enable)
+ synchronize_irq(pci_irq_vector(hdev->pdev,
+ hdev->asic_prop.eq_interrupt_id));
+ }
+}
+
static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
{
u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
@@ -1419,28 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
} else {
hdev->reset_info.reset_trigger_repeated = 1;
}
-
- /* If reset is due to heartbeat, device CPU is no responsive in
- * which case no point sending PCI disable message to it.
- *
- * If F/W is performing the reset, no need to send it a message to disable
- * PCI access
- */
- if ((flags & HL_DRV_RESET_HARD) &&
- !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
- /* Disable PCI access from device F/W so he won't send
- * us additional interrupts. We disable MSI/MSI-X at
- * the halt_engines function and we can't have the F/W
- * sending us interrupts after that. We need to disable
- * the access here because if the device is marked
- * disable, the message won't be send. Also, in case
- * of heartbeat, the device CPU is marked as disable
- * so this message won't be sent
- */
- if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
- dev_warn(hdev->dev,
- "Failed to disable FW's PCI access\n");
- }
}
/*
@@ -1561,6 +1566,7 @@ do_reset:
escalate_reset_flow:
handle_reset_trigger(hdev, flags);
+ send_disable_pci_access(hdev, flags);
/* This also blocks future CS/VM/JOB completion operations */
hdev->disabled = true;
@@ -1823,9 +1829,7 @@ kill_processes:
dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
flags = hdev->reset_info.hard_reset_schedule_flags;
hdev->reset_info.hard_reset_schedule_flags = 0;
- hdev->disabled = true;
hard_reset = true;
- handle_reset_trigger(hdev, flags);
goto escalate_reset_flow;
}
}
diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 7ea611392f8c..59f61ec66445 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -71,7 +71,7 @@ free_fw_ver:
return NULL;
}
-static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
+static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver)
{
char major[8], minor[8], *first_dot, *second_dot;
int rc;
@@ -86,7 +86,7 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
if (rc) {
dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc);
- goto out;
+ return rc;
}
/* skip the first dot */
@@ -102,9 +102,6 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver)
if (rc)
dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc);
-
-out:
- kfree(preboot_ver);
return rc;
}
@@ -1263,7 +1260,7 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev)
COMMS_RST_DEV, 0, false,
hdev->fw_loader.cpu_timeout);
if (rc)
- dev_warn(hdev->dev, "Failed sending COMMS_RST_DEV\n");
+ dev_err(hdev->dev, "Failed sending COMMS_RST_DEV\n");
} else {
WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_RST_DEV);
}
@@ -1281,10 +1278,10 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev)
/* Stop device CPU to make sure nothing bad happens */
if (hdev->asic_prop.dynamic_fw_load) {
rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader,
- COMMS_GOTO_WFE, 0, true,
+ COMMS_GOTO_WFE, 0, false,
hdev->fw_loader.cpu_timeout);
if (rc)
- dev_warn(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
+ dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n");
} else {
WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE);
msleep(static_loader->cpu_reset_wait_msec);
@@ -2181,8 +2178,8 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
dev_info(hdev->dev, "preboot version %s\n", preboot_ver);
- /* This function takes care of freeing preboot_ver */
- rc = extract_fw_sub_versions(hdev, preboot_ver);
+ rc = hl_get_preboot_major_minor(hdev, preboot_ver);
+ kfree(preboot_ver);
if (rc)
return rc;
}
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index a6f5c2152b0a..eaae69a9f817 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -662,7 +662,7 @@ struct hl_hints_range {
* @user_interrupt_count: number of user interrupts.
* @user_dec_intr_count: number of decoder interrupts exposed to user.
* @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host.
- * @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error.
+ * @eq_interrupt_id: interrupt id for EQ, uses to synchronize EQ interrupts in hard-reset.
* @cache_line_size: device cache line size.
* @server_type: Server type that the ASIC is currently installed in.
* The value is according to enum hl_server_type in uapi file.
@@ -793,7 +793,7 @@ struct asic_fixed_properties {
u16 user_interrupt_count;
u16 user_dec_intr_count;
u16 tpc_interrupt_id;
- u16 unexpected_user_error_interrupt_id;
+ u16 eq_interrupt_id;
u16 cache_line_size;
u16 server_type;
u8 completion_queues_count;
@@ -1211,15 +1211,15 @@ struct hl_eq {
/**
* struct hl_dec - describes a decoder sw instance.
* @hdev: pointer to the device structure.
- * @completion_abnrm_work: workqueue object to run when decoder generates an error interrupt
+ * @abnrm_intr_work: workqueue work item to run when decoder generates an error interrupt.
* @core_id: ID of the decoder.
* @base_addr: base address of the decoder.
*/
struct hl_dec {
- struct hl_device *hdev;
- struct work_struct completion_abnrm_work;
- u32 core_id;
- u32 base_addr;
+ struct hl_device *hdev;
+ struct work_struct abnrm_intr_work;
+ u32 core_id;
+ u32 base_addr;
};
/**
diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c
index fab1abc5c910..c67895b1cdeb 100644
--- a/drivers/accel/habanalabs/common/irq.c
+++ b/drivers/accel/habanalabs/common/irq.c
@@ -415,8 +415,8 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
struct hl_eq_entry *eq_base;
struct hl_eqe_work *handle_eqe_work;
bool entry_ready;
- u32 cur_eqe;
- u16 cur_eqe_index;
+ u32 cur_eqe, ctl;
+ u16 cur_eqe_index, event_type;
eq_base = eq->kernel_address;
@@ -449,7 +449,10 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
dma_rmb();
if (hdev->disabled && !hdev->reset_info.in_compute_reset) {
- dev_warn(hdev->dev, "Device disabled but received an EQ event\n");
+ ctl = le32_to_cpu(eq_entry->hdr.ctl);
+ event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK) >> EQ_CTL_EVENT_TYPE_SHIFT);
+ dev_warn(hdev->dev,
+ "Device disabled but received an EQ event (%u)\n", event_type);
goto skip_irq;
}
@@ -486,7 +489,7 @@ irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg)
{
struct hl_dec *dec = arg;
- schedule_work(&dec->completion_abnrm_work);
+ schedule_work(&dec->abnrm_intr_work);
return IRQ_HANDLED;
}
diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
index 17b79d717896..a7b6a273ce21 100644
--- a/drivers/accel/habanalabs/common/memory.c
+++ b/drivers/accel/habanalabs/common/memory.c
@@ -605,6 +605,7 @@ static u64 get_va_block(struct hl_device *hdev,
bool is_align_pow_2 = is_power_of_2(va_range->page_size);
bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr);
bool force_hint = flags & HL_MEM_FORCE_HINT;
+ int rc;
if (is_align_pow_2)
align_mask = ~((u64)va_block_align - 1);
@@ -722,9 +723,13 @@ static u64 get_va_block(struct hl_device *hdev,
kfree(new_va_block);
}
- if (add_prev)
- add_va_block_locked(hdev, &va_range->list, prev_start,
- prev_end);
+ if (add_prev) {
+ rc = add_va_block_locked(hdev, &va_range->list, prev_start, prev_end);
+ if (rc) {
+ reserved_valid_start = 0;
+ goto out;
+ }
+ }
print_va_list_locked(hdev, &va_range->list);
out:
diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c
index 17581b1bcc77..f379e5b461a6 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu.c
@@ -679,7 +679,9 @@ int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags)
rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags);
if (rc)
- dev_err_ratelimited(hdev->dev, "MMU cache invalidation failed\n");
+ dev_err_ratelimited(hdev->dev,
+ "%s cache invalidation failed, rc=%d\n",
+ flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", rc);
return rc;
}
@@ -692,7 +694,9 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, is_hard, flags,
asid, va, size);
if (rc)
- dev_err_ratelimited(hdev->dev, "MMU cache range invalidation failed\n");
+ dev_err_ratelimited(hdev->dev,
+ "%s cache range invalidation failed: va=%#llx, size=%llu, rc=%d",
+ flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", va, size, rc);
return rc;
}
diff --git a/drivers/accel/habanalabs/common/pci/pci.c b/drivers/accel/habanalabs/common/pci/pci.c
index d1f4c695baf2..191e0e3cf3a5 100644
--- a/drivers/accel/habanalabs/common/pci/pci.c
+++ b/drivers/accel/habanalabs/common/pci/pci.c
@@ -420,7 +420,6 @@ int hl_pci_init(struct hl_device *hdev)
unmap_pci_bars:
hl_pci_bars_unmap(hdev);
disable_device:
- pci_clear_master(pdev);
pci_disable_device(pdev);
return rc;
@@ -436,6 +435,5 @@ void hl_pci_fini(struct hl_device *hdev)
{
hl_pci_bars_unmap(hdev);
- pci_clear_master(hdev->pdev);
pci_disable_device(hdev->pdev);
}
diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c
index 735d8bed0066..01f89f029355 100644
--- a/drivers/accel/habanalabs/common/sysfs.c
+++ b/drivers/accel/habanalabs/common/sysfs.c
@@ -497,10 +497,14 @@ int hl_sysfs_init(struct hl_device *hdev)
if (rc) {
dev_err(hdev->dev,
"Failed to add groups to device, error %d\n", rc);
- return rc;
+ goto remove_groups;
}
return 0;
+
+remove_groups:
+ device_remove_groups(hdev->dev, hl_dev_attr_groups);
+ return rc;
}
void hl_sysfs_fini(struct hl_device *hdev)
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index 08a4b1cf2b42..a29aa8f7b6f3 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -682,6 +682,9 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
prop->first_available_user_interrupt = USHRT_MAX;
prop->tpc_interrupt_id = USHRT_MAX;
+ /* single msi */
+ prop->eq_interrupt_id = 0;
+
for (i = 0 ; i < HL_MAX_DCORES ; i++)
prop->first_available_cq[i] = USHRT_MAX;
@@ -2017,38 +2020,6 @@ static int gaudi_enable_msi_single(struct hl_device *hdev)
return rc;
}
-static int gaudi_enable_msi_multi(struct hl_device *hdev)
-{
- int cq_cnt = hdev->asic_prop.completion_queues_count;
- int rc, i, irq_cnt_init, irq;
-
- for (i = 0, irq_cnt_init = 0 ; i < cq_cnt ; i++, irq_cnt_init++) {
- irq = gaudi_pci_irq_vector(hdev, i, false);
- rc = request_irq(irq, hl_irq_handler_cq, 0, gaudi_irq_name[i],
- &hdev->completion_queue[i]);
- if (rc) {
- dev_err(hdev->dev, "Failed to request IRQ %d", irq);
- goto free_irqs;
- }
- }
-
- irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX, true);
- rc = request_irq(irq, hl_irq_handler_eq, 0, gaudi_irq_name[cq_cnt],
- &hdev->event_queue);
- if (rc) {
- dev_err(hdev->dev, "Failed to request IRQ %d", irq);
- goto free_irqs;
- }
-
- return 0;
-
-free_irqs:
- for (i = 0 ; i < irq_cnt_init ; i++)
- free_irq(gaudi_pci_irq_vector(hdev, i, false),
- &hdev->completion_queue[i]);
- return rc;
-}
-
static int gaudi_enable_msi(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@@ -2063,14 +2034,7 @@ static int gaudi_enable_msi(struct hl_device *hdev)
return rc;
}
- if (rc < NUMBER_OF_INTERRUPTS) {
- gaudi->multi_msi_mode = false;
- rc = gaudi_enable_msi_single(hdev);
- } else {
- gaudi->multi_msi_mode = true;
- rc = gaudi_enable_msi_multi(hdev);
- }
-
+ rc = gaudi_enable_msi_single(hdev);
if (rc)
goto free_pci_irq_vectors;
@@ -2086,47 +2050,23 @@ free_pci_irq_vectors:
static void gaudi_sync_irqs(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
- int i, cq_cnt = hdev->asic_prop.completion_queues_count;
if (!(gaudi->hw_cap_initialized & HW_CAP_MSI))
return;
/* Wait for all pending IRQs to be finished */
- if (gaudi->multi_msi_mode) {
- for (i = 0 ; i < cq_cnt ; i++)
- synchronize_irq(gaudi_pci_irq_vector(hdev, i, false));
-
- synchronize_irq(gaudi_pci_irq_vector(hdev,
- GAUDI_EVENT_QUEUE_MSI_IDX,
- true));
- } else {
- synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false));
- }
+ synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false));
}
static void gaudi_disable_msi(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
- int i, irq, cq_cnt = hdev->asic_prop.completion_queues_count;
if (!(gaudi->hw_cap_initialized & HW_CAP_MSI))
return;
gaudi_sync_irqs(hdev);
-
- if (gaudi->multi_msi_mode) {
- irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX,
- true);
- free_irq(irq, &hdev->event_queue);
-
- for (i = 0 ; i < cq_cnt ; i++) {
- irq = gaudi_pci_irq_vector(hdev, i, false);
- free_irq(irq, &hdev->completion_queue[i]);
- }
- } else {
- free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev);
- }
-
+ free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev);
pci_free_irq_vectors(hdev->pdev);
gaudi->hw_cap_initialized &= ~HW_CAP_MSI;
@@ -3921,11 +3861,7 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout)
WREG32(mmCPU_IF_PF_PQ_PI, 0);
- if (gaudi->multi_msi_mode)
- WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP);
- else
- WREG32(mmCPU_IF_QUEUE_INIT,
- PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI);
+ WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI);
irq_handler_offset = prop->gic_interrupts_enable ?
mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
@@ -5602,7 +5538,6 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_add
u32 len, u32 original_len, u64 cq_addr, u32 cq_val,
u32 msi_vec, bool eb)
{
- struct gaudi_device *gaudi = hdev->asic_specific;
struct packet_msg_prot *cq_pkt;
struct packet_nop *cq_padding;
u64 msi_addr;
@@ -5632,12 +5567,7 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_add
tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
cq_pkt->ctl = cpu_to_le32(tmp);
cq_pkt->value = cpu_to_le32(1);
-
- if (gaudi->multi_msi_mode)
- msi_addr = mmPCIE_MSI_INTR_0 + msi_vec * 4;
- else
- msi_addr = mmPCIE_CORE_MSI_REQ;
-
+ msi_addr = hdev->pdev ? mmPCIE_CORE_MSI_REQ : mmPCIE_MSI_INTR_0 + msi_vec * 4;
cq_pkt->addr = cpu_to_le64(CFG_BASE + msi_addr);
}
diff --git a/drivers/accel/habanalabs/gaudi/gaudiP.h b/drivers/accel/habanalabs/gaudi/gaudiP.h
index 3d88d56c8eb3..b8fa724be5a1 100644
--- a/drivers/accel/habanalabs/gaudi/gaudiP.h
+++ b/drivers/accel/habanalabs/gaudi/gaudiP.h
@@ -28,20 +28,8 @@
#define NUMBER_OF_COLLECTIVE_QUEUES 12
#define NUMBER_OF_SOBS_IN_GRP 11
-/*
- * Number of MSI interrupts IDS:
- * Each completion queue has 1 ID
- * The event queue has 1 ID
- */
-#define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + \
- NUMBER_OF_CPU_HW_QUEUES)
-
#define GAUDI_STREAM_MASTER_ARR_SIZE 8
-#if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES)
-#error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES"
-#endif
-
#define CORESIGHT_TIMEOUT_USEC 100000 /* 100 ms */
#define GAUDI_MAX_CLK_FREQ 2200000000ull /* 2200 MHz */
@@ -324,8 +312,6 @@ struct gaudi_internal_qman_info {
* signal we can use this engine in later code paths.
* Each bit is cleared upon reset of its corresponding H/W
* engine.
- * @multi_msi_mode: whether we are working in multi MSI single MSI mode.
- * Multi MSI is possible only with IOMMU enabled.
* @mmu_cache_inv_pi: PI for MMU cache invalidation flow. The H/W expects an
* 8-bit value so use u8.
*/
@@ -345,7 +331,6 @@ struct gaudi_device {
u32 events_stat[GAUDI_EVENT_SIZE];
u32 events_stat_aggregate[GAUDI_EVENT_SIZE];
u32 hw_cap_initialized;
- u8 multi_msi_mode;
u8 mmu_cache_inv_pi;
};
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index edcbda3d9b40..b778cf764a68 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2112,6 +2112,7 @@ static bool gaudi2_get_mme_idle_status(struct hl_device *hdev, u64 *mask_arr, u8
static bool gaudi2_get_edma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
struct engines_data *e);
static u64 gaudi2_mmu_scramble_addr(struct hl_device *hdev, u64 raw_addr);
+static u64 gaudi2_mmu_descramble_addr(struct hl_device *hdev, u64 scrambled_addr);
static void gaudi2_init_scrambler_hbm(struct hl_device *hdev)
{
@@ -2438,7 +2439,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
prop->first_available_user_interrupt = GAUDI2_IRQ_NUM_USER_FIRST;
prop->tpc_interrupt_id = GAUDI2_IRQ_NUM_TPC_ASSERT;
- prop->unexpected_user_error_interrupt_id = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR;
+ prop->eq_interrupt_id = GAUDI2_IRQ_NUM_EVENT_QUEUE;
prop->first_available_cq[0] = GAUDI2_RESERVED_CQ_NUMBER;
@@ -2887,6 +2888,10 @@ static int gaudi2_cpucp_info_get(struct hl_device *hdev)
hdev->tpc_binning = le64_to_cpu(prop->cpucp_info.tpc_binning_mask);
hdev->decoder_binning = lower_32_bits(le64_to_cpu(prop->cpucp_info.decoder_binning_mask));
+ dev_dbg(hdev->dev, "Read binning masks: tpc: 0x%llx, dram: 0x%llx, edma: 0x%x, dec: 0x%x\n",
+ hdev->tpc_binning, hdev->dram_binning, hdev->edma_binning,
+ hdev->decoder_binning);
+
/*
* at this point the DRAM parameters need to be updated according to data obtained
* from the FW
@@ -3345,7 +3350,7 @@ static void gaudi2_user_interrupt_setup(struct hl_device *hdev)
/* Initialize TPC interrupt */
HL_USR_INTR_STRUCT_INIT(hdev->tpc_interrupt, hdev, 0, HL_USR_INTERRUPT_TPC);
- /* Initialize general purpose interrupt */
+ /* Initialize unexpected error interrupt */
HL_USR_INTR_STRUCT_INIT(hdev->unexpected_error_interrupt, hdev, 0,
HL_USR_INTERRUPT_UNEXPECTED);
@@ -3475,6 +3480,48 @@ static int gaudi2_special_blocks_iterator_config(struct hl_device *hdev)
return gaudi2_special_blocks_config(hdev);
}
+static void gaudi2_test_queues_msgs_free(struct hl_device *hdev)
+{
+ struct gaudi2_device *gaudi2 = hdev->asic_specific;
+ struct gaudi2_queues_test_info *msg_info = gaudi2->queues_test_info;
+ int i;
+
+ for (i = 0 ; i < GAUDI2_NUM_TESTED_QS ; i++) {
+ /* bail-out if this is an allocation failure point */
+ if (!msg_info[i].kern_addr)
+ break;
+
+ hl_asic_dma_pool_free(hdev, msg_info[i].kern_addr, msg_info[i].dma_addr);
+ msg_info[i].kern_addr = NULL;
+ }
+}
+
+static int gaudi2_test_queues_msgs_alloc(struct hl_device *hdev)
+{
+ struct gaudi2_device *gaudi2 = hdev->asic_specific;
+ struct gaudi2_queues_test_info *msg_info = gaudi2->queues_test_info;
+ int i, rc;
+
+ /* allocate a message-short buf for each Q we intend to test */
+ for (i = 0 ; i < GAUDI2_NUM_TESTED_QS ; i++) {
+ msg_info[i].kern_addr =
+ (void *)hl_asic_dma_pool_zalloc(hdev, sizeof(struct packet_msg_short),
+ GFP_KERNEL, &msg_info[i].dma_addr);
+ if (!msg_info[i].kern_addr) {
+ dev_err(hdev->dev,
+ "Failed to allocate dma memory for H/W queue %d testing\n", i);
+ rc = -ENOMEM;
+ goto err_exit;
+ }
+ }
+
+ return 0;
+
+err_exit:
+ gaudi2_test_queues_msgs_free(hdev);
+ return rc;
+}
+
static int gaudi2_sw_init(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -3574,8 +3621,14 @@ static int gaudi2_sw_init(struct hl_device *hdev)
if (rc)
goto free_scratchpad_mem;
+ rc = gaudi2_test_queues_msgs_alloc(hdev);
+ if (rc)
+ goto special_blocks_free;
+
return 0;
+special_blocks_free:
+ gaudi2_special_blocks_iterator_free(hdev);
free_scratchpad_mem:
hl_asic_dma_pool_free(hdev, gaudi2->scratchpad_kernel_address,
gaudi2->scratchpad_bus_address);
@@ -3598,6 +3651,8 @@ static int gaudi2_sw_fini(struct hl_device *hdev)
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct gaudi2_device *gaudi2 = hdev->asic_specific;
+ gaudi2_test_queues_msgs_free(hdev);
+
gaudi2_special_blocks_iterator_free(hdev);
hl_cpu_accessible_dma_pool_free(hdev, prop->pmmu.page_size, gaudi2->virt_msix_db_cpu_addr);
@@ -4009,7 +4064,7 @@ static const char *gaudi2_irq_name(u16 irq_number)
case GAUDI2_IRQ_NUM_TPC_ASSERT:
return "gaudi2 tpc assert";
case GAUDI2_IRQ_NUM_UNEXPECTED_ERROR:
- return "gaudi2 tpc assert";
+ return "gaudi2 unexpected error";
case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST:
return "gaudi2 user completion";
default:
@@ -6792,28 +6847,29 @@ static void gaudi2_qman_set_test_mode(struct hl_device *hdev, u32 hw_queue_id, b
}
}
-static int gaudi2_test_queue(struct hl_device *hdev, u32 hw_queue_id)
+static inline u32 gaudi2_test_queue_hw_queue_id_to_sob_id(struct hl_device *hdev, u32 hw_queue_id)
{
- u32 sob_offset = hdev->asic_prop.first_available_user_sob[0] * 4;
+ return hdev->asic_prop.first_available_user_sob[0] +
+ hw_queue_id - GAUDI2_QUEUE_ID_PDMA_0_0;
+}
+
+static void gaudi2_test_queue_clear(struct hl_device *hdev, u32 hw_queue_id)
+{
+ u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4;
u32 sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset;
- u32 timeout_usec, tmp, sob_base = 1, sob_val = 0x5a5a;
- struct packet_msg_short *msg_short_pkt;
- dma_addr_t pkt_dma_addr;
- size_t pkt_size;
- int rc;
- if (hdev->pldm)
- timeout_usec = GAUDI2_PLDM_TEST_QUEUE_WAIT_USEC;
- else
- timeout_usec = GAUDI2_TEST_QUEUE_WAIT_USEC;
+ /* Reset the SOB value */
+ WREG32(sob_addr, 0);
+}
- pkt_size = sizeof(*msg_short_pkt);
- msg_short_pkt = hl_asic_dma_pool_zalloc(hdev, pkt_size, GFP_KERNEL, &pkt_dma_addr);
- if (!msg_short_pkt) {
- dev_err(hdev->dev, "Failed to allocate packet for H/W queue %d testing\n",
- hw_queue_id);
- return -ENOMEM;
- }
+static int gaudi2_test_queue_send_msg_short(struct hl_device *hdev, u32 hw_queue_id, u32 sob_val,
+ struct gaudi2_queues_test_info *msg_info)
+{
+ u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4;
+ u32 tmp, sob_base = 1;
+ struct packet_msg_short *msg_short_pkt = msg_info->kern_addr;
+ size_t pkt_size = sizeof(struct packet_msg_short);
+ int rc;
tmp = (PACKET_MSG_SHORT << GAUDI2_PKT_CTL_OPCODE_SHIFT) |
(1 << GAUDI2_PKT_CTL_EB_SHIFT) |
@@ -6824,15 +6880,25 @@ static int gaudi2_test_queue(struct hl_device *hdev, u32 hw_queue_id)
msg_short_pkt->value = cpu_to_le32(sob_val);
msg_short_pkt->ctl = cpu_to_le32(tmp);
- /* Reset the SOB value */
- WREG32(sob_addr, 0);
+ rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, msg_info->dma_addr);
+ if (rc)
+ dev_err(hdev->dev,
+ "Failed to send msg_short packet to H/W queue %d\n", hw_queue_id);
- rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr);
- if (rc) {
- dev_err(hdev->dev, "Failed to send msg_short packet to H/W queue %d\n",
- hw_queue_id);
- goto free_pkt;
- }
+ return rc;
+}
+
+static int gaudi2_test_queue_wait_completion(struct hl_device *hdev, u32 hw_queue_id, u32 sob_val)
+{
+ u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4;
+ u32 sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset;
+ u32 timeout_usec, tmp;
+ int rc;
+
+ if (hdev->pldm)
+ timeout_usec = GAUDI2_PLDM_TEST_QUEUE_WAIT_USEC;
+ else
+ timeout_usec = GAUDI2_TEST_QUEUE_WAIT_USEC;
rc = hl_poll_timeout(
hdev,
@@ -6848,11 +6914,6 @@ static int gaudi2_test_queue(struct hl_device *hdev, u32 hw_queue_id)
rc = -EIO;
}
- /* Reset the SOB value */
- WREG32(sob_addr, 0);
-
-free_pkt:
- hl_asic_dma_pool_free(hdev, (void *) msg_short_pkt, pkt_dma_addr);
return rc;
}
@@ -6872,30 +6933,44 @@ static int gaudi2_test_cpu_queue(struct hl_device *hdev)
static int gaudi2_test_queues(struct hl_device *hdev)
{
- int i, rc, ret_val = 0;
+ struct gaudi2_device *gaudi2 = hdev->asic_specific;
+ struct gaudi2_queues_test_info *msg_info;
+ u32 sob_val = 0x5a5a;
+ int i, rc;
+ /* send test message on all enabled Qs */
for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) {
if (!gaudi2_is_queue_enabled(hdev, i))
continue;
+ msg_info = &gaudi2->queues_test_info[i - GAUDI2_QUEUE_ID_PDMA_0_0];
gaudi2_qman_set_test_mode(hdev, i, true);
- rc = gaudi2_test_queue(hdev, i);
- gaudi2_qman_set_test_mode(hdev, i, false);
-
- if (rc) {
- ret_val = -EINVAL;
+ gaudi2_test_queue_clear(hdev, i);
+ rc = gaudi2_test_queue_send_msg_short(hdev, i, sob_val, msg_info);
+ if (rc)
goto done;
- }
}
rc = gaudi2_test_cpu_queue(hdev);
- if (rc) {
- ret_val = -EINVAL;
+ if (rc)
goto done;
+
+ /* verify that all messages were processed */
+ for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) {
+ if (!gaudi2_is_queue_enabled(hdev, i))
+ continue;
+
+ rc = gaudi2_test_queue_wait_completion(hdev, i, sob_val);
+ if (rc)
+ /* chip is not usable, no need for cleanups, just bail-out with error */
+ goto done;
+
+ gaudi2_test_queue_clear(hdev, i);
+ gaudi2_qman_set_test_mode(hdev, i, false);
}
done:
- return ret_val;
+ return rc;
}
static int gaudi2_compute_reset_late_init(struct hl_device *hdev)
@@ -8485,23 +8560,28 @@ static int gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
static int gaudi2_handle_arc_farm_sei_err(struct hl_device *hdev, u16 event_type)
{
- u32 i, sts_val, sts_clr_val = 0, error_count = 0;
+ u32 i, sts_val, sts_clr_val, error_count = 0, arc_farm;
- sts_val = RREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_STS);
+ for (arc_farm = 0 ; arc_farm < NUM_OF_ARC_FARMS_ARC ; arc_farm++) {
+ sts_clr_val = 0;
+ sts_val = RREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_STS +
+ (arc_farm * ARC_FARM_OFFSET));
- for (i = 0 ; i < GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE ; i++) {
- if (sts_val & BIT(i)) {
- gaudi2_print_event(hdev, event_type, true,
- "err cause: %s", gaudi2_arc_sei_error_cause[i]);
- sts_clr_val |= BIT(i);
- error_count++;
+ for (i = 0 ; i < GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE ; i++) {
+ if (sts_val & BIT(i)) {
+ gaudi2_print_event(hdev, event_type, true,
+ "ARC FARM ARC %u err cause: %s",
+ arc_farm, gaudi2_arc_sei_error_cause[i]);
+ sts_clr_val |= BIT(i);
+ error_count++;
+ }
}
+ WREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_CLR + (arc_farm * ARC_FARM_OFFSET),
+ sts_clr_val);
}
hl_check_for_glbl_errors(hdev);
- WREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_CLR, sts_clr_val);
-
return error_count;
}
@@ -8844,7 +8924,7 @@ static int gaudi2_handle_hif_fatal(struct hl_device *hdev, u16 event_type, u64 i
static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu,
u64 *event_mask)
{
- u32 valid, val, axid_l, axid_h;
+ u32 valid, val;
u64 addr;
valid = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID));
@@ -8857,11 +8937,11 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool
addr <<= 32;
addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE_VA));
- axid_l = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_FAULT_ID_LSB));
- axid_h = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_FAULT_ID_MSB));
+ if (!is_pmmu)
+ addr = gaudi2_mmu_descramble_addr(hdev, addr);
- dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx, transaction id 0x%llX\n",
- is_pmmu ? "PMMU" : "HMMU", addr, ((u64)axid_h << 32) + axid_l);
+ dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n",
+ is_pmmu ? "PMMU" : "HMMU", addr);
hl_handle_page_fault(hdev, addr, 0, is_pmmu, event_mask);
WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0);
@@ -8882,9 +8962,12 @@ static void gaudi2_handle_access_error(struct hl_device *hdev, u64 mmu_base, boo
addr <<= 32;
addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE_VA));
+ if (!is_pmmu)
+ addr = gaudi2_mmu_descramble_addr(hdev, addr);
+
dev_err_ratelimited(hdev->dev, "%s access error on va 0x%llx\n",
is_pmmu ? "PMMU" : "HMMU", addr);
- WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE), 0);
+ WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0);
}
static int gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, u16 event_type,
@@ -8976,46 +9059,110 @@ static int gaudi2_handle_sm_err(struct hl_device *hdev, u16 event_type, u8 sm_in
return error_count;
}
+static u64 get_hmmu_base(u16 event_type)
+{
+ u8 dcore, index_in_dcore;
+
+ switch (event_type) {
+ case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU0_SPI_BASE ... GAUDI2_EVENT_HMMU0_SECURITY_ERROR:
+ dcore = 0;
+ index_in_dcore = 0;
+ break;
+ case GAUDI2_EVENT_HMMU_1_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU1_SPI_BASE ... GAUDI2_EVENT_HMMU1_SECURITY_ERROR:
+ dcore = 1;
+ index_in_dcore = 0;
+ break;
+ case GAUDI2_EVENT_HMMU_2_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU2_SPI_BASE ... GAUDI2_EVENT_HMMU2_SECURITY_ERROR:
+ dcore = 0;
+ index_in_dcore = 1;
+ break;
+ case GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU3_SPI_BASE ... GAUDI2_EVENT_HMMU3_SECURITY_ERROR:
+ dcore = 1;
+ index_in_dcore = 1;
+ break;
+ case GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU4_SPI_BASE ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR:
+ dcore = 3;
+ index_in_dcore = 2;
+ break;
+ case GAUDI2_EVENT_HMMU_5_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU5_SPI_BASE ... GAUDI2_EVENT_HMMU5_SECURITY_ERROR:
+ dcore = 2;
+ index_in_dcore = 2;
+ break;
+ case GAUDI2_EVENT_HMMU_6_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU6_SPI_BASE ... GAUDI2_EVENT_HMMU6_SECURITY_ERROR:
+ dcore = 3;
+ index_in_dcore = 3;
+ break;
+ case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU7_SPI_BASE ... GAUDI2_EVENT_HMMU7_SECURITY_ERROR:
+ dcore = 2;
+ index_in_dcore = 3;
+ break;
+ case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU8_SPI_BASE ... GAUDI2_EVENT_HMMU8_SECURITY_ERROR:
+ dcore = 0;
+ index_in_dcore = 2;
+ break;
+ case GAUDI2_EVENT_HMMU_9_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU9_SPI_BASE ... GAUDI2_EVENT_HMMU9_SECURITY_ERROR:
+ dcore = 1;
+ index_in_dcore = 2;
+ break;
+ case GAUDI2_EVENT_HMMU_10_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU10_SPI_BASE ... GAUDI2_EVENT_HMMU10_SECURITY_ERROR:
+ dcore = 0;
+ index_in_dcore = 3;
+ break;
+ case GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU11_SPI_BASE ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR:
+ dcore = 1;
+ index_in_dcore = 3;
+ break;
+ case GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU12_SPI_BASE ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR:
+ dcore = 3;
+ index_in_dcore = 0;
+ break;
+ case GAUDI2_EVENT_HMMU_13_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU13_SPI_BASE ... GAUDI2_EVENT_HMMU13_SECURITY_ERROR:
+ dcore = 2;
+ index_in_dcore = 0;
+ break;
+ case GAUDI2_EVENT_HMMU_14_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU14_SPI_BASE ... GAUDI2_EVENT_HMMU14_SECURITY_ERROR:
+ dcore = 3;
+ index_in_dcore = 1;
+ break;
+ case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU15_SPI_BASE ... GAUDI2_EVENT_HMMU15_SECURITY_ERROR:
+ dcore = 2;
+ index_in_dcore = 1;
+ break;
+ default:
+ return ULONG_MAX;
+ }
+
+ return mmDCORE0_HMMU0_MMU_BASE + dcore * DCORE_OFFSET + index_in_dcore * DCORE_HMMU_OFFSET;
+}
+
static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask)
{
bool is_pmmu = false;
u32 error_count = 0;
u64 mmu_base;
- u8 index;
switch (event_type) {
- case GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM ... GAUDI2_EVENT_HMMU3_SECURITY_ERROR:
- index = (event_type - GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM) / 3;
- mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
- break;
- case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP:
- index = (event_type - GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP);
- mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
- break;
- case GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR:
- index = (event_type - GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM) / 3;
- mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
- break;
- case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP:
- index = (event_type - GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP);
- mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
- break;
- case GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR:
- index = (event_type - GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM) / 3;
- mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
- break;
- case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP:
- index = (event_type - GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP);
- mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
- break;
- case GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR:
- index = (event_type - GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM) / 3;
- mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
- break;
- case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP:
- index = (event_type - GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP);
- mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET;
+ case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP:
+ case GAUDI2_EVENT_HMMU0_SPI_BASE ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR:
+ mmu_base = get_hmmu_base(event_type);
break;
+
case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR:
case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0:
is_pmmu = true;
@@ -9025,6 +9172,9 @@ static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type,
return 0;
}
+ if (mmu_base == ULONG_MAX)
+ return 0;
+
error_count = gaudi2_handle_mmu_spi_sei_generic(hdev, event_type, mmu_base,
is_pmmu, event_mask);
hl_check_for_glbl_errors(hdev);
@@ -9435,19 +9585,18 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
break;
case GAUDI2_EVENT_ARC_AXI_ERROR_RESPONSE_0:
- reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
error_count = gaudi2_handle_arc_farm_sei_err(hdev, event_type);
- event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_CPU_AXI_ERR_RSP:
error_count = gaudi2_handle_cpu_sei_err(hdev, event_type);
- event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+ reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
break;
case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
- reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
error_count = gaudi2_handle_qm_sei_err(hdev, event_type, true, &event_mask);
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
@@ -9634,12 +9783,14 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
case GAUDI2_EVENT_PCIE_DRAIN_COMPLETE:
error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data);
+ reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PSOC59_RPM_ERROR_OR_DRAIN:
error_count = gaudi2_handle_psoc_drain(hdev,
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+ reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
@@ -9668,6 +9819,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
break;
case GAUDI2_EVENT_PSOC_AXI_ERR_RSP:
error_count = GAUDI2_NA_EVENT_CAUSE;
+ reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PSOC_PRSTN_FALL:
@@ -9681,6 +9833,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
break;
case GAUDI2_EVENT_PCIE_FATAL_ERR:
error_count = GAUDI2_NA_EVENT_CAUSE;
+ reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_TPC0_BMON_SPMU:
@@ -9748,6 +9901,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
case GAUDI2_EVENT_CPU_PKT_QUEUE_OUT_SYNC:
gaudi2_print_out_of_sync_info(hdev, event_type, &eq_entry->pkt_sync_err);
error_count = GAUDI2_NA_EVENT_CAUSE;
+ reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
@@ -9789,6 +9943,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
case GAUDI2_EVENT_CPU_PKT_SANITY_FAILED:
gaudi2_print_cpu_pkt_failure_info(hdev, event_type, &eq_entry->pkt_sync_err);
error_count = GAUDI2_NA_EVENT_CAUSE;
+ reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
index 0742046810f9..1cebe707772e 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
@@ -240,6 +240,8 @@
#define GAUDI2_SOB_INCREMENT_BY_ONE (FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1) | \
FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1))
+#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0)
+
#define GAUDI2_NUM_OF_GLBL_ERR_CAUSE 8
enum gaudi2_reserved_sob_id {
@@ -453,6 +455,17 @@ struct dup_block_ctx {
};
/**
+ * struct gaudi2_queues_test_info - Holds the address of a the messages used for testing the
+ * device queues.
+ * @dma_addr: the address used by the HW for accessing the message.
+ * @kern_addr: The address used by the driver for accessing the message.
+ */
+struct gaudi2_queues_test_info {
+ dma_addr_t dma_addr;
+ void *kern_addr;
+};
+
+/**
* struct gaudi2_device - ASIC specific manage structure.
* @cpucp_info_get: get information on device from CPU-CP
* @mapped_blocks: array that holds the base address and size of all blocks
@@ -510,6 +523,7 @@ struct dup_block_ctx {
* @flush_db_fifo: flag to force flush DB FIFO after a write.
* @hbm_cfg: HBM subsystem settings
* @hw_queues_lock_mutex: used by simulator instead of hw_queues_lock.
+ * @queues_test_info: information used by the driver when testing the HW queues.
*/
struct gaudi2_device {
int (*cpucp_info_get)(struct hl_device *hdev);
@@ -537,6 +551,9 @@ struct gaudi2_device {
u32 events_stat[GAUDI2_EVENT_SIZE];
u32 events_stat_aggregate[GAUDI2_EVENT_SIZE];
u32 num_of_valid_hw_events;
+
+ /* Queue testing */
+ struct gaudi2_queues_test_info queues_test_info[GAUDI2_NUM_TESTED_QS];
};
/*
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index 07d67878eac5..fb0ac9df841a 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -473,6 +473,7 @@ int goya_set_fixed_properties(struct hl_device *hdev)
prop->first_available_user_interrupt = USHRT_MAX;
prop->tpc_interrupt_id = USHRT_MAX;
+ prop->eq_interrupt_id = GOYA_EVENT_QUEUE_MSIX_IDX;
for (i = 0 ; i < HL_MAX_DCORES ; i++)
prop->first_available_cq[i] = USHRT_MAX;
diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
index 452b379f39f6..6c58af614236 100644
--- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
+++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0
*
- * Copyright 2020-2022 HabanaLabs, Ltd.
+ * Copyright 2020-2023 HabanaLabs, Ltd.
* All Rights Reserved.
*
*/
@@ -543,6 +543,8 @@
#define HBM_MC_SPI_IEEE1500_COMP_MASK BIT(3)
#define HBM_MC_SPI_IEEE1500_PAUSED_MASK BIT(4)
+#define ARC_FARM_OFFSET (mmARC_FARM_ARC1_AUX_BASE - mmARC_FARM_ARC0_AUX_BASE)
+
#include "nic0_qpc0_regs.h"
#include "nic0_qm0_regs.h"
#include "nic0_qm_arc_aux0_regs.h"
diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h
index c139aab17c8a..d9ef1b151d04 100644
--- a/include/uapi/drm/habanalabs_accel.h
+++ b/include/uapi/drm/habanalabs_accel.h
@@ -708,7 +708,8 @@ enum hl_server_type {
HL_SERVER_GAUDI_HLS1H = 2,
HL_SERVER_GAUDI_TYPE1 = 3,
HL_SERVER_GAUDI_TYPE2 = 4,
- HL_SERVER_GAUDI2_HLS2 = 5
+ HL_SERVER_GAUDI2_HLS2 = 5,
+ HL_SERVER_GAUDI2_TYPE1 = 7
};
/*