diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2023-04-11 14:37:20 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2023-04-11 14:37:20 +1000 |
commit | 2909cca706bf8c6dd62f9c3b14e98a46e3ed3f42 (patch) | |
tree | 9ee11f3af04fe11c897f3235f8a0b57d5f9d6810 | |
parent | dea3215f33f2ba4e29046036d13de1df9c18b2da (diff) | |
parent | 56499c461589634f2c89ffbd9cfb78268191d349 (diff) | |
download | linux-next-2909cca706bf8c6dd62f9c3b14e98a46e3ed3f42.tar.gz |
Merge branch 'habanalabs-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux.git
-rw-r--r-- | drivers/accel/habanalabs/common/command_buffer.c | 15 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/decoder.c | 40 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/device.c | 54 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/firmware_if.c | 17 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/habanalabs.h | 14 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/irq.c | 11 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/memory.c | 11 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/mmu/mmu.c | 8 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/pci/pci.c | 2 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/sysfs.c | 6 | ||||
-rw-r--r-- | drivers/accel/habanalabs/gaudi/gaudi.c | 86 | ||||
-rw-r--r-- | drivers/accel/habanalabs/gaudi/gaudiP.h | 15 | ||||
-rw-r--r-- | drivers/accel/habanalabs/gaudi2/gaudi2.c | 347 | ||||
-rw-r--r-- | drivers/accel/habanalabs/gaudi2/gaudi2P.h | 17 | ||||
-rw-r--r-- | drivers/accel/habanalabs/goya/goya.c | 1 | ||||
-rw-r--r-- | drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h | 4 | ||||
-rw-r--r-- | include/uapi/drm/habanalabs_accel.h | 3 |
17 files changed, 382 insertions, 269 deletions
diff --git a/drivers/accel/habanalabs/common/command_buffer.c b/drivers/accel/habanalabs/common/command_buffer.c index 3a0535ac28b1..6e09f48750a0 100644 --- a/drivers/accel/habanalabs/common/command_buffer.c +++ b/drivers/accel/habanalabs/common/command_buffer.c @@ -45,20 +45,29 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb) } mutex_lock(&hdev->mmu_lock); + rc = hl_mmu_map_contiguous(ctx, cb->virtual_addr, cb->bus_address, cb->roundup_size); if (rc) { dev_err(hdev->dev, "Failed to map VA %#llx to CB\n", cb->virtual_addr); - goto err_va_umap; + goto err_va_pool_free; } + rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV); + if (rc) + goto err_mmu_unmap; + mutex_unlock(&hdev->mmu_lock); cb->is_mmu_mapped = true; - return rc; -err_va_umap: + return 0; + +err_mmu_unmap: + hl_mmu_unmap_contiguous(ctx, cb->virtual_addr, cb->roundup_size); +err_va_pool_free: mutex_unlock(&hdev->mmu_lock); gen_pool_free(ctx->cb_va_pool, cb->virtual_addr, cb->roundup_size); + return rc; } diff --git a/drivers/accel/habanalabs/common/decoder.c b/drivers/accel/habanalabs/common/decoder.c index 69c78c1784b4..c03a6da45d00 100644 --- a/drivers/accel/habanalabs/common/decoder.c +++ b/drivers/accel/habanalabs/common/decoder.c @@ -43,48 +43,46 @@ static void dec_print_abnrm_intr_source(struct hl_device *hdev, u32 irq_status) intr_source[2], intr_source[3], intr_source[4], intr_source[5]); } -static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id) +static void dec_abnrm_intr_work(struct work_struct *work) { + struct hl_dec *dec = container_of(work, struct hl_dec, abnrm_intr_work); + struct hl_device *hdev = dec->hdev; + u32 irq_status, event_mask = 0; bool reset_required = false; - u32 irq_status, event_mask; - irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET); + irq_status = RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET); - dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, core_id); + dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, dec->core_id); dec_print_abnrm_intr_source(hdev, irq_status); /* Clear the interrupt */ - WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status); + WREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status); /* Flush the interrupt clear */ - RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET); + RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET); if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) { reset_required = true; - event_mask = HL_NOTIFIER_EVENT_GENERAL_HW_ERR; - } else if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) { - event_mask = HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; - } else { - event_mask = HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; } + if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) + event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; + + if (irq_status & (VCMD_IRQ_STATUS_ENDCMD_MASK | + VCMD_IRQ_STATUS_BUSERR_MASK | + VCMD_IRQ_STATUS_ABORT_MASK)) + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + if (reset_required) { event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; hl_device_cond_reset(hdev, 0, event_mask); - } else { + } else if (event_mask) { hl_notifier_event_send_all(hdev, event_mask); } } -static void dec_completion_abnrm(struct work_struct *work) -{ - struct hl_dec *dec = container_of(work, struct hl_dec, completion_abnrm_work); - struct hl_device *hdev = dec->hdev; - - dec_error_intr_work(hdev, dec->base_addr, dec->core_id); -} - void hl_dec_fini(struct hl_device *hdev) { kfree(hdev->dec); @@ -108,7 +106,7 @@ int hl_dec_init(struct hl_device *hdev) dec = hdev->dec + j; dec->hdev = hdev; - INIT_WORK(&dec->completion_abnrm_work, dec_completion_abnrm); + INIT_WORK(&dec->abnrm_intr_work, dec_abnrm_intr_work); dec->core_id = j; dec->base_addr = hdev->asic_funcs->get_dec_base_addr(hdev, j); if (!dec->base_addr) { diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 713005998cbc..fabfc501ef54 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1271,7 +1271,6 @@ int hl_device_resume(struct hl_device *hdev) return 0; disable_device: - pci_clear_master(hdev->pdev); pci_disable_device(hdev->pdev); return rc; @@ -1381,6 +1380,34 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d mutex_unlock(fd_lock); } +static void send_disable_pci_access(struct hl_device *hdev, u32 flags) +{ + /* If reset is due to heartbeat, device CPU is no responsive in + * which case no point sending PCI disable message to it. + */ + if ((flags & HL_DRV_RESET_HARD) && + !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { + /* Disable PCI access from device F/W so he won't send + * us additional interrupts. We disable MSI/MSI-X at + * the halt_engines function and we can't have the F/W + * sending us interrupts after that. We need to disable + * the access here because if the device is marked + * disable, the message won't be send. Also, in case + * of heartbeat, the device CPU is marked as disable + * so this message won't be sent + */ + if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) { + dev_warn(hdev->dev, "Failed to disable FW's PCI access\n"); + return; + } + + /* verify that last EQs are handled before disabled is set */ + if (hdev->cpu_queues_enable) + synchronize_irq(pci_irq_vector(hdev->pdev, + hdev->asic_prop.eq_interrupt_id)); + } +} + static void handle_reset_trigger(struct hl_device *hdev, u32 flags) { u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT; @@ -1419,28 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) } else { hdev->reset_info.reset_trigger_repeated = 1; } - - /* If reset is due to heartbeat, device CPU is no responsive in - * which case no point sending PCI disable message to it. - * - * If F/W is performing the reset, no need to send it a message to disable - * PCI access - */ - if ((flags & HL_DRV_RESET_HARD) && - !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { - /* Disable PCI access from device F/W so he won't send - * us additional interrupts. We disable MSI/MSI-X at - * the halt_engines function and we can't have the F/W - * sending us interrupts after that. We need to disable - * the access here because if the device is marked - * disable, the message won't be send. Also, in case - * of heartbeat, the device CPU is marked as disable - * so this message won't be sent - */ - if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) - dev_warn(hdev->dev, - "Failed to disable FW's PCI access\n"); - } } /* @@ -1561,6 +1566,7 @@ do_reset: escalate_reset_flow: handle_reset_trigger(hdev, flags); + send_disable_pci_access(hdev, flags); /* This also blocks future CS/VM/JOB completion operations */ hdev->disabled = true; @@ -1823,9 +1829,7 @@ kill_processes: dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); flags = hdev->reset_info.hard_reset_schedule_flags; hdev->reset_info.hard_reset_schedule_flags = 0; - hdev->disabled = true; hard_reset = true; - handle_reset_trigger(hdev, flags); goto escalate_reset_flow; } } diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 7ea611392f8c..59f61ec66445 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -71,7 +71,7 @@ free_fw_ver: return NULL; } -static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver) +static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) { char major[8], minor[8], *first_dot, *second_dot; int rc; @@ -86,7 +86,7 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver) if (rc) { dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc); - goto out; + return rc; } /* skip the first dot */ @@ -102,9 +102,6 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver) if (rc) dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc); - -out: - kfree(preboot_ver); return rc; } @@ -1263,7 +1260,7 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev) COMMS_RST_DEV, 0, false, hdev->fw_loader.cpu_timeout); if (rc) - dev_warn(hdev->dev, "Failed sending COMMS_RST_DEV\n"); + dev_err(hdev->dev, "Failed sending COMMS_RST_DEV\n"); } else { WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_RST_DEV); } @@ -1281,10 +1278,10 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev) /* Stop device CPU to make sure nothing bad happens */ if (hdev->asic_prop.dynamic_fw_load) { rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader, - COMMS_GOTO_WFE, 0, true, + COMMS_GOTO_WFE, 0, false, hdev->fw_loader.cpu_timeout); if (rc) - dev_warn(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); + dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); } else { WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE); msleep(static_loader->cpu_reset_wait_msec); @@ -2181,8 +2178,8 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev, dev_info(hdev->dev, "preboot version %s\n", preboot_ver); - /* This function takes care of freeing preboot_ver */ - rc = extract_fw_sub_versions(hdev, preboot_ver); + rc = hl_get_preboot_major_minor(hdev, preboot_ver); + kfree(preboot_ver); if (rc) return rc; } diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index a6f5c2152b0a..eaae69a9f817 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -662,7 +662,7 @@ struct hl_hints_range { * @user_interrupt_count: number of user interrupts. * @user_dec_intr_count: number of decoder interrupts exposed to user. * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host. - * @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error. + * @eq_interrupt_id: interrupt id for EQ, uses to synchronize EQ interrupts in hard-reset. * @cache_line_size: device cache line size. * @server_type: Server type that the ASIC is currently installed in. * The value is according to enum hl_server_type in uapi file. @@ -793,7 +793,7 @@ struct asic_fixed_properties { u16 user_interrupt_count; u16 user_dec_intr_count; u16 tpc_interrupt_id; - u16 unexpected_user_error_interrupt_id; + u16 eq_interrupt_id; u16 cache_line_size; u16 server_type; u8 completion_queues_count; @@ -1211,15 +1211,15 @@ struct hl_eq { /** * struct hl_dec - describes a decoder sw instance. * @hdev: pointer to the device structure. - * @completion_abnrm_work: workqueue object to run when decoder generates an error interrupt + * @abnrm_intr_work: workqueue work item to run when decoder generates an error interrupt. * @core_id: ID of the decoder. * @base_addr: base address of the decoder. */ struct hl_dec { - struct hl_device *hdev; - struct work_struct completion_abnrm_work; - u32 core_id; - u32 base_addr; + struct hl_device *hdev; + struct work_struct abnrm_intr_work; + u32 core_id; + u32 base_addr; }; /** diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c index fab1abc5c910..c67895b1cdeb 100644 --- a/drivers/accel/habanalabs/common/irq.c +++ b/drivers/accel/habanalabs/common/irq.c @@ -415,8 +415,8 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg) struct hl_eq_entry *eq_base; struct hl_eqe_work *handle_eqe_work; bool entry_ready; - u32 cur_eqe; - u16 cur_eqe_index; + u32 cur_eqe, ctl; + u16 cur_eqe_index, event_type; eq_base = eq->kernel_address; @@ -449,7 +449,10 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg) dma_rmb(); if (hdev->disabled && !hdev->reset_info.in_compute_reset) { - dev_warn(hdev->dev, "Device disabled but received an EQ event\n"); + ctl = le32_to_cpu(eq_entry->hdr.ctl); + event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK) >> EQ_CTL_EVENT_TYPE_SHIFT); + dev_warn(hdev->dev, + "Device disabled but received an EQ event (%u)\n", event_type); goto skip_irq; } @@ -486,7 +489,7 @@ irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg) { struct hl_dec *dec = arg; - schedule_work(&dec->completion_abnrm_work); + schedule_work(&dec->abnrm_intr_work); return IRQ_HANDLED; } diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c index 17b79d717896..a7b6a273ce21 100644 --- a/drivers/accel/habanalabs/common/memory.c +++ b/drivers/accel/habanalabs/common/memory.c @@ -605,6 +605,7 @@ static u64 get_va_block(struct hl_device *hdev, bool is_align_pow_2 = is_power_of_2(va_range->page_size); bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr); bool force_hint = flags & HL_MEM_FORCE_HINT; + int rc; if (is_align_pow_2) align_mask = ~((u64)va_block_align - 1); @@ -722,9 +723,13 @@ static u64 get_va_block(struct hl_device *hdev, kfree(new_va_block); } - if (add_prev) - add_va_block_locked(hdev, &va_range->list, prev_start, - prev_end); + if (add_prev) { + rc = add_va_block_locked(hdev, &va_range->list, prev_start, prev_end); + if (rc) { + reserved_valid_start = 0; + goto out; + } + } print_va_list_locked(hdev, &va_range->list); out: diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c index 17581b1bcc77..f379e5b461a6 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu.c +++ b/drivers/accel/habanalabs/common/mmu/mmu.c @@ -679,7 +679,9 @@ int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags) rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags); if (rc) - dev_err_ratelimited(hdev->dev, "MMU cache invalidation failed\n"); + dev_err_ratelimited(hdev->dev, + "%s cache invalidation failed, rc=%d\n", + flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", rc); return rc; } @@ -692,7 +694,9 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, is_hard, flags, asid, va, size); if (rc) - dev_err_ratelimited(hdev->dev, "MMU cache range invalidation failed\n"); + dev_err_ratelimited(hdev->dev, + "%s cache range invalidation failed: va=%#llx, size=%llu, rc=%d", + flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", va, size, rc); return rc; } diff --git a/drivers/accel/habanalabs/common/pci/pci.c b/drivers/accel/habanalabs/common/pci/pci.c index d1f4c695baf2..191e0e3cf3a5 100644 --- a/drivers/accel/habanalabs/common/pci/pci.c +++ b/drivers/accel/habanalabs/common/pci/pci.c @@ -420,7 +420,6 @@ int hl_pci_init(struct hl_device *hdev) unmap_pci_bars: hl_pci_bars_unmap(hdev); disable_device: - pci_clear_master(pdev); pci_disable_device(pdev); return rc; @@ -436,6 +435,5 @@ void hl_pci_fini(struct hl_device *hdev) { hl_pci_bars_unmap(hdev); - pci_clear_master(hdev->pdev); pci_disable_device(hdev->pdev); } diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c index 735d8bed0066..01f89f029355 100644 --- a/drivers/accel/habanalabs/common/sysfs.c +++ b/drivers/accel/habanalabs/common/sysfs.c @@ -497,10 +497,14 @@ int hl_sysfs_init(struct hl_device *hdev) if (rc) { dev_err(hdev->dev, "Failed to add groups to device, error %d\n", rc); - return rc; + goto remove_groups; } return 0; + +remove_groups: + device_remove_groups(hdev->dev, hl_dev_attr_groups); + return rc; } void hl_sysfs_fini(struct hl_device *hdev) diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 08a4b1cf2b42..a29aa8f7b6f3 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -682,6 +682,9 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->first_available_user_interrupt = USHRT_MAX; prop->tpc_interrupt_id = USHRT_MAX; + /* single msi */ + prop->eq_interrupt_id = 0; + for (i = 0 ; i < HL_MAX_DCORES ; i++) prop->first_available_cq[i] = USHRT_MAX; @@ -2017,38 +2020,6 @@ static int gaudi_enable_msi_single(struct hl_device *hdev) return rc; } -static int gaudi_enable_msi_multi(struct hl_device *hdev) -{ - int cq_cnt = hdev->asic_prop.completion_queues_count; - int rc, i, irq_cnt_init, irq; - - for (i = 0, irq_cnt_init = 0 ; i < cq_cnt ; i++, irq_cnt_init++) { - irq = gaudi_pci_irq_vector(hdev, i, false); - rc = request_irq(irq, hl_irq_handler_cq, 0, gaudi_irq_name[i], - &hdev->completion_queue[i]); - if (rc) { - dev_err(hdev->dev, "Failed to request IRQ %d", irq); - goto free_irqs; - } - } - - irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX, true); - rc = request_irq(irq, hl_irq_handler_eq, 0, gaudi_irq_name[cq_cnt], - &hdev->event_queue); - if (rc) { - dev_err(hdev->dev, "Failed to request IRQ %d", irq); - goto free_irqs; - } - - return 0; - -free_irqs: - for (i = 0 ; i < irq_cnt_init ; i++) - free_irq(gaudi_pci_irq_vector(hdev, i, false), - &hdev->completion_queue[i]); - return rc; -} - static int gaudi_enable_msi(struct hl_device *hdev) { struct gaudi_device *gaudi = hdev->asic_specific; @@ -2063,14 +2034,7 @@ static int gaudi_enable_msi(struct hl_device *hdev) return rc; } - if (rc < NUMBER_OF_INTERRUPTS) { - gaudi->multi_msi_mode = false; - rc = gaudi_enable_msi_single(hdev); - } else { - gaudi->multi_msi_mode = true; - rc = gaudi_enable_msi_multi(hdev); - } - + rc = gaudi_enable_msi_single(hdev); if (rc) goto free_pci_irq_vectors; @@ -2086,47 +2050,23 @@ free_pci_irq_vectors: static void gaudi_sync_irqs(struct hl_device *hdev) { struct gaudi_device *gaudi = hdev->asic_specific; - int i, cq_cnt = hdev->asic_prop.completion_queues_count; if (!(gaudi->hw_cap_initialized & HW_CAP_MSI)) return; /* Wait for all pending IRQs to be finished */ - if (gaudi->multi_msi_mode) { - for (i = 0 ; i < cq_cnt ; i++) - synchronize_irq(gaudi_pci_irq_vector(hdev, i, false)); - - synchronize_irq(gaudi_pci_irq_vector(hdev, - GAUDI_EVENT_QUEUE_MSI_IDX, - true)); - } else { - synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false)); - } + synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false)); } static void gaudi_disable_msi(struct hl_device *hdev) { struct gaudi_device *gaudi = hdev->asic_specific; - int i, irq, cq_cnt = hdev->asic_prop.completion_queues_count; if (!(gaudi->hw_cap_initialized & HW_CAP_MSI)) return; gaudi_sync_irqs(hdev); - - if (gaudi->multi_msi_mode) { - irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX, - true); - free_irq(irq, &hdev->event_queue); - - for (i = 0 ; i < cq_cnt ; i++) { - irq = gaudi_pci_irq_vector(hdev, i, false); - free_irq(irq, &hdev->completion_queue[i]); - } - } else { - free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev); - } - + free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev); pci_free_irq_vectors(hdev->pdev); gaudi->hw_cap_initialized &= ~HW_CAP_MSI; @@ -3921,11 +3861,7 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout) WREG32(mmCPU_IF_PF_PQ_PI, 0); - if (gaudi->multi_msi_mode) - WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP); - else - WREG32(mmCPU_IF_QUEUE_INIT, - PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI); + WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI); irq_handler_offset = prop->gic_interrupts_enable ? mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : @@ -5602,7 +5538,6 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_add u32 len, u32 original_len, u64 cq_addr, u32 cq_val, u32 msi_vec, bool eb) { - struct gaudi_device *gaudi = hdev->asic_specific; struct packet_msg_prot *cq_pkt; struct packet_nop *cq_padding; u64 msi_addr; @@ -5632,12 +5567,7 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_add tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); cq_pkt->ctl = cpu_to_le32(tmp); cq_pkt->value = cpu_to_le32(1); - - if (gaudi->multi_msi_mode) - msi_addr = mmPCIE_MSI_INTR_0 + msi_vec * 4; - else - msi_addr = mmPCIE_CORE_MSI_REQ; - + msi_addr = hdev->pdev ? mmPCIE_CORE_MSI_REQ : mmPCIE_MSI_INTR_0 + msi_vec * 4; cq_pkt->addr = cpu_to_le64(CFG_BASE + msi_addr); } diff --git a/drivers/accel/habanalabs/gaudi/gaudiP.h b/drivers/accel/habanalabs/gaudi/gaudiP.h index 3d88d56c8eb3..b8fa724be5a1 100644 --- a/drivers/accel/habanalabs/gaudi/gaudiP.h +++ b/drivers/accel/habanalabs/gaudi/gaudiP.h @@ -28,20 +28,8 @@ #define NUMBER_OF_COLLECTIVE_QUEUES 12 #define NUMBER_OF_SOBS_IN_GRP 11 -/* - * Number of MSI interrupts IDS: - * Each completion queue has 1 ID - * The event queue has 1 ID - */ -#define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + \ - NUMBER_OF_CPU_HW_QUEUES) - #define GAUDI_STREAM_MASTER_ARR_SIZE 8 -#if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES) -#error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES" -#endif - #define CORESIGHT_TIMEOUT_USEC 100000 /* 100 ms */ #define GAUDI_MAX_CLK_FREQ 2200000000ull /* 2200 MHz */ @@ -324,8 +312,6 @@ struct gaudi_internal_qman_info { * signal we can use this engine in later code paths. * Each bit is cleared upon reset of its corresponding H/W * engine. - * @multi_msi_mode: whether we are working in multi MSI single MSI mode. - * Multi MSI is possible only with IOMMU enabled. * @mmu_cache_inv_pi: PI for MMU cache invalidation flow. The H/W expects an * 8-bit value so use u8. */ @@ -345,7 +331,6 @@ struct gaudi_device { u32 events_stat[GAUDI_EVENT_SIZE]; u32 events_stat_aggregate[GAUDI_EVENT_SIZE]; u32 hw_cap_initialized; - u8 multi_msi_mode; u8 mmu_cache_inv_pi; }; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index edcbda3d9b40..b778cf764a68 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2112,6 +2112,7 @@ static bool gaudi2_get_mme_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 static bool gaudi2_get_edma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, struct engines_data *e); static u64 gaudi2_mmu_scramble_addr(struct hl_device *hdev, u64 raw_addr); +static u64 gaudi2_mmu_descramble_addr(struct hl_device *hdev, u64 scrambled_addr); static void gaudi2_init_scrambler_hbm(struct hl_device *hdev) { @@ -2438,7 +2439,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->first_available_user_interrupt = GAUDI2_IRQ_NUM_USER_FIRST; prop->tpc_interrupt_id = GAUDI2_IRQ_NUM_TPC_ASSERT; - prop->unexpected_user_error_interrupt_id = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR; + prop->eq_interrupt_id = GAUDI2_IRQ_NUM_EVENT_QUEUE; prop->first_available_cq[0] = GAUDI2_RESERVED_CQ_NUMBER; @@ -2887,6 +2888,10 @@ static int gaudi2_cpucp_info_get(struct hl_device *hdev) hdev->tpc_binning = le64_to_cpu(prop->cpucp_info.tpc_binning_mask); hdev->decoder_binning = lower_32_bits(le64_to_cpu(prop->cpucp_info.decoder_binning_mask)); + dev_dbg(hdev->dev, "Read binning masks: tpc: 0x%llx, dram: 0x%llx, edma: 0x%x, dec: 0x%x\n", + hdev->tpc_binning, hdev->dram_binning, hdev->edma_binning, + hdev->decoder_binning); + /* * at this point the DRAM parameters need to be updated according to data obtained * from the FW @@ -3345,7 +3350,7 @@ static void gaudi2_user_interrupt_setup(struct hl_device *hdev) /* Initialize TPC interrupt */ HL_USR_INTR_STRUCT_INIT(hdev->tpc_interrupt, hdev, 0, HL_USR_INTERRUPT_TPC); - /* Initialize general purpose interrupt */ + /* Initialize unexpected error interrupt */ HL_USR_INTR_STRUCT_INIT(hdev->unexpected_error_interrupt, hdev, 0, HL_USR_INTERRUPT_UNEXPECTED); @@ -3475,6 +3480,48 @@ static int gaudi2_special_blocks_iterator_config(struct hl_device *hdev) return gaudi2_special_blocks_config(hdev); } +static void gaudi2_test_queues_msgs_free(struct hl_device *hdev) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + struct gaudi2_queues_test_info *msg_info = gaudi2->queues_test_info; + int i; + + for (i = 0 ; i < GAUDI2_NUM_TESTED_QS ; i++) { + /* bail-out if this is an allocation failure point */ + if (!msg_info[i].kern_addr) + break; + + hl_asic_dma_pool_free(hdev, msg_info[i].kern_addr, msg_info[i].dma_addr); + msg_info[i].kern_addr = NULL; + } +} + +static int gaudi2_test_queues_msgs_alloc(struct hl_device *hdev) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + struct gaudi2_queues_test_info *msg_info = gaudi2->queues_test_info; + int i, rc; + + /* allocate a message-short buf for each Q we intend to test */ + for (i = 0 ; i < GAUDI2_NUM_TESTED_QS ; i++) { + msg_info[i].kern_addr = + (void *)hl_asic_dma_pool_zalloc(hdev, sizeof(struct packet_msg_short), + GFP_KERNEL, &msg_info[i].dma_addr); + if (!msg_info[i].kern_addr) { + dev_err(hdev->dev, + "Failed to allocate dma memory for H/W queue %d testing\n", i); + rc = -ENOMEM; + goto err_exit; + } + } + + return 0; + +err_exit: + gaudi2_test_queues_msgs_free(hdev); + return rc; +} + static int gaudi2_sw_init(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; @@ -3574,8 +3621,14 @@ static int gaudi2_sw_init(struct hl_device *hdev) if (rc) goto free_scratchpad_mem; + rc = gaudi2_test_queues_msgs_alloc(hdev); + if (rc) + goto special_blocks_free; + return 0; +special_blocks_free: + gaudi2_special_blocks_iterator_free(hdev); free_scratchpad_mem: hl_asic_dma_pool_free(hdev, gaudi2->scratchpad_kernel_address, gaudi2->scratchpad_bus_address); @@ -3598,6 +3651,8 @@ static int gaudi2_sw_fini(struct hl_device *hdev) struct asic_fixed_properties *prop = &hdev->asic_prop; struct gaudi2_device *gaudi2 = hdev->asic_specific; + gaudi2_test_queues_msgs_free(hdev); + gaudi2_special_blocks_iterator_free(hdev); hl_cpu_accessible_dma_pool_free(hdev, prop->pmmu.page_size, gaudi2->virt_msix_db_cpu_addr); @@ -4009,7 +4064,7 @@ static const char *gaudi2_irq_name(u16 irq_number) case GAUDI2_IRQ_NUM_TPC_ASSERT: return "gaudi2 tpc assert"; case GAUDI2_IRQ_NUM_UNEXPECTED_ERROR: - return "gaudi2 tpc assert"; + return "gaudi2 unexpected error"; case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST: return "gaudi2 user completion"; default: @@ -6792,28 +6847,29 @@ static void gaudi2_qman_set_test_mode(struct hl_device *hdev, u32 hw_queue_id, b } } -static int gaudi2_test_queue(struct hl_device *hdev, u32 hw_queue_id) +static inline u32 gaudi2_test_queue_hw_queue_id_to_sob_id(struct hl_device *hdev, u32 hw_queue_id) { - u32 sob_offset = hdev->asic_prop.first_available_user_sob[0] * 4; + return hdev->asic_prop.first_available_user_sob[0] + + hw_queue_id - GAUDI2_QUEUE_ID_PDMA_0_0; +} + +static void gaudi2_test_queue_clear(struct hl_device *hdev, u32 hw_queue_id) +{ + u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4; u32 sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset; - u32 timeout_usec, tmp, sob_base = 1, sob_val = 0x5a5a; - struct packet_msg_short *msg_short_pkt; - dma_addr_t pkt_dma_addr; - size_t pkt_size; - int rc; - if (hdev->pldm) - timeout_usec = GAUDI2_PLDM_TEST_QUEUE_WAIT_USEC; - else - timeout_usec = GAUDI2_TEST_QUEUE_WAIT_USEC; + /* Reset the SOB value */ + WREG32(sob_addr, 0); +} - pkt_size = sizeof(*msg_short_pkt); - msg_short_pkt = hl_asic_dma_pool_zalloc(hdev, pkt_size, GFP_KERNEL, &pkt_dma_addr); - if (!msg_short_pkt) { - dev_err(hdev->dev, "Failed to allocate packet for H/W queue %d testing\n", - hw_queue_id); - return -ENOMEM; - } +static int gaudi2_test_queue_send_msg_short(struct hl_device *hdev, u32 hw_queue_id, u32 sob_val, + struct gaudi2_queues_test_info *msg_info) +{ + u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4; + u32 tmp, sob_base = 1; + struct packet_msg_short *msg_short_pkt = msg_info->kern_addr; + size_t pkt_size = sizeof(struct packet_msg_short); + int rc; tmp = (PACKET_MSG_SHORT << GAUDI2_PKT_CTL_OPCODE_SHIFT) | (1 << GAUDI2_PKT_CTL_EB_SHIFT) | @@ -6824,15 +6880,25 @@ static int gaudi2_test_queue(struct hl_device *hdev, u32 hw_queue_id) msg_short_pkt->value = cpu_to_le32(sob_val); msg_short_pkt->ctl = cpu_to_le32(tmp); - /* Reset the SOB value */ - WREG32(sob_addr, 0); + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, msg_info->dma_addr); + if (rc) + dev_err(hdev->dev, + "Failed to send msg_short packet to H/W queue %d\n", hw_queue_id); - rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr); - if (rc) { - dev_err(hdev->dev, "Failed to send msg_short packet to H/W queue %d\n", - hw_queue_id); - goto free_pkt; - } + return rc; +} + +static int gaudi2_test_queue_wait_completion(struct hl_device *hdev, u32 hw_queue_id, u32 sob_val) +{ + u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4; + u32 sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset; + u32 timeout_usec, tmp; + int rc; + + if (hdev->pldm) + timeout_usec = GAUDI2_PLDM_TEST_QUEUE_WAIT_USEC; + else + timeout_usec = GAUDI2_TEST_QUEUE_WAIT_USEC; rc = hl_poll_timeout( hdev, @@ -6848,11 +6914,6 @@ static int gaudi2_test_queue(struct hl_device *hdev, u32 hw_queue_id) rc = -EIO; } - /* Reset the SOB value */ - WREG32(sob_addr, 0); - -free_pkt: - hl_asic_dma_pool_free(hdev, (void *) msg_short_pkt, pkt_dma_addr); return rc; } @@ -6872,30 +6933,44 @@ static int gaudi2_test_cpu_queue(struct hl_device *hdev) static int gaudi2_test_queues(struct hl_device *hdev) { - int i, rc, ret_val = 0; + struct gaudi2_device *gaudi2 = hdev->asic_specific; + struct gaudi2_queues_test_info *msg_info; + u32 sob_val = 0x5a5a; + int i, rc; + /* send test message on all enabled Qs */ for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { if (!gaudi2_is_queue_enabled(hdev, i)) continue; + msg_info = &gaudi2->queues_test_info[i - GAUDI2_QUEUE_ID_PDMA_0_0]; gaudi2_qman_set_test_mode(hdev, i, true); - rc = gaudi2_test_queue(hdev, i); - gaudi2_qman_set_test_mode(hdev, i, false); - - if (rc) { - ret_val = -EINVAL; + gaudi2_test_queue_clear(hdev, i); + rc = gaudi2_test_queue_send_msg_short(hdev, i, sob_val, msg_info); + if (rc) goto done; - } } rc = gaudi2_test_cpu_queue(hdev); - if (rc) { - ret_val = -EINVAL; + if (rc) goto done; + + /* verify that all messages were processed */ + for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { + if (!gaudi2_is_queue_enabled(hdev, i)) + continue; + + rc = gaudi2_test_queue_wait_completion(hdev, i, sob_val); + if (rc) + /* chip is not usable, no need for cleanups, just bail-out with error */ + goto done; + + gaudi2_test_queue_clear(hdev, i); + gaudi2_qman_set_test_mode(hdev, i, false); } done: - return ret_val; + return rc; } static int gaudi2_compute_reset_late_init(struct hl_device *hdev) @@ -8485,23 +8560,28 @@ static int gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e static int gaudi2_handle_arc_farm_sei_err(struct hl_device *hdev, u16 event_type) { - u32 i, sts_val, sts_clr_val = 0, error_count = 0; + u32 i, sts_val, sts_clr_val, error_count = 0, arc_farm; - sts_val = RREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_STS); + for (arc_farm = 0 ; arc_farm < NUM_OF_ARC_FARMS_ARC ; arc_farm++) { + sts_clr_val = 0; + sts_val = RREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_STS + + (arc_farm * ARC_FARM_OFFSET)); - for (i = 0 ; i < GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE ; i++) { - if (sts_val & BIT(i)) { - gaudi2_print_event(hdev, event_type, true, - "err cause: %s", gaudi2_arc_sei_error_cause[i]); - sts_clr_val |= BIT(i); - error_count++; + for (i = 0 ; i < GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE ; i++) { + if (sts_val & BIT(i)) { + gaudi2_print_event(hdev, event_type, true, + "ARC FARM ARC %u err cause: %s", + arc_farm, gaudi2_arc_sei_error_cause[i]); + sts_clr_val |= BIT(i); + error_count++; + } } + WREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_CLR + (arc_farm * ARC_FARM_OFFSET), + sts_clr_val); } hl_check_for_glbl_errors(hdev); - WREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_CLR, sts_clr_val); - return error_count; } @@ -8844,7 +8924,7 @@ static int gaudi2_handle_hif_fatal(struct hl_device *hdev, u16 event_type, u64 i static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu, u64 *event_mask) { - u32 valid, val, axid_l, axid_h; + u32 valid, val; u64 addr; valid = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID)); @@ -8857,11 +8937,11 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool addr <<= 32; addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE_VA)); - axid_l = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_FAULT_ID_LSB)); - axid_h = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_FAULT_ID_MSB)); + if (!is_pmmu) + addr = gaudi2_mmu_descramble_addr(hdev, addr); - dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx, transaction id 0x%llX\n", - is_pmmu ? "PMMU" : "HMMU", addr, ((u64)axid_h << 32) + axid_l); + dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n", + is_pmmu ? "PMMU" : "HMMU", addr); hl_handle_page_fault(hdev, addr, 0, is_pmmu, event_mask); WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0); @@ -8882,9 +8962,12 @@ static void gaudi2_handle_access_error(struct hl_device *hdev, u64 mmu_base, boo addr <<= 32; addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE_VA)); + if (!is_pmmu) + addr = gaudi2_mmu_descramble_addr(hdev, addr); + dev_err_ratelimited(hdev->dev, "%s access error on va 0x%llx\n", is_pmmu ? "PMMU" : "HMMU", addr); - WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE), 0); + WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0); } static int gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, u16 event_type, @@ -8976,46 +9059,110 @@ static int gaudi2_handle_sm_err(struct hl_device *hdev, u16 event_type, u8 sm_in return error_count; } +static u64 get_hmmu_base(u16 event_type) +{ + u8 dcore, index_in_dcore; + + switch (event_type) { + case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU0_SPI_BASE ... GAUDI2_EVENT_HMMU0_SECURITY_ERROR: + dcore = 0; + index_in_dcore = 0; + break; + case GAUDI2_EVENT_HMMU_1_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU1_SPI_BASE ... GAUDI2_EVENT_HMMU1_SECURITY_ERROR: + dcore = 1; + index_in_dcore = 0; + break; + case GAUDI2_EVENT_HMMU_2_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU2_SPI_BASE ... GAUDI2_EVENT_HMMU2_SECURITY_ERROR: + dcore = 0; + index_in_dcore = 1; + break; + case GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU3_SPI_BASE ... GAUDI2_EVENT_HMMU3_SECURITY_ERROR: + dcore = 1; + index_in_dcore = 1; + break; + case GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU4_SPI_BASE ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR: + dcore = 3; + index_in_dcore = 2; + break; + case GAUDI2_EVENT_HMMU_5_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU5_SPI_BASE ... GAUDI2_EVENT_HMMU5_SECURITY_ERROR: + dcore = 2; + index_in_dcore = 2; + break; + case GAUDI2_EVENT_HMMU_6_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU6_SPI_BASE ... GAUDI2_EVENT_HMMU6_SECURITY_ERROR: + dcore = 3; + index_in_dcore = 3; + break; + case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU7_SPI_BASE ... GAUDI2_EVENT_HMMU7_SECURITY_ERROR: + dcore = 2; + index_in_dcore = 3; + break; + case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU8_SPI_BASE ... GAUDI2_EVENT_HMMU8_SECURITY_ERROR: + dcore = 0; + index_in_dcore = 2; + break; + case GAUDI2_EVENT_HMMU_9_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU9_SPI_BASE ... GAUDI2_EVENT_HMMU9_SECURITY_ERROR: + dcore = 1; + index_in_dcore = 2; + break; + case GAUDI2_EVENT_HMMU_10_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU10_SPI_BASE ... GAUDI2_EVENT_HMMU10_SECURITY_ERROR: + dcore = 0; + index_in_dcore = 3; + break; + case GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU11_SPI_BASE ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR: + dcore = 1; + index_in_dcore = 3; + break; + case GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU12_SPI_BASE ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR: + dcore = 3; + index_in_dcore = 0; + break; + case GAUDI2_EVENT_HMMU_13_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU13_SPI_BASE ... GAUDI2_EVENT_HMMU13_SECURITY_ERROR: + dcore = 2; + index_in_dcore = 0; + break; + case GAUDI2_EVENT_HMMU_14_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU14_SPI_BASE ... GAUDI2_EVENT_HMMU14_SECURITY_ERROR: + dcore = 3; + index_in_dcore = 1; + break; + case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU15_SPI_BASE ... GAUDI2_EVENT_HMMU15_SECURITY_ERROR: + dcore = 2; + index_in_dcore = 1; + break; + default: + return ULONG_MAX; + } + + return mmDCORE0_HMMU0_MMU_BASE + dcore * DCORE_OFFSET + index_in_dcore * DCORE_HMMU_OFFSET; +} + static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask) { bool is_pmmu = false; u32 error_count = 0; u64 mmu_base; - u8 index; switch (event_type) { - case GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM ... GAUDI2_EVENT_HMMU3_SECURITY_ERROR: - index = (event_type - GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM) / 3; - mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP: - index = (event_type - GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP); - mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR: - index = (event_type - GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM) / 3; - mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP: - index = (event_type - GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP); - mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR: - index = (event_type - GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM) / 3; - mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP: - index = (event_type - GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP); - mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR: - index = (event_type - GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM) / 3; - mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: - index = (event_type - GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP); - mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; + case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU0_SPI_BASE ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR: + mmu_base = get_hmmu_base(event_type); break; + case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR: case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0: is_pmmu = true; @@ -9025,6 +9172,9 @@ static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, return 0; } + if (mmu_base == ULONG_MAX) + return 0; + error_count = gaudi2_handle_mmu_spi_sei_generic(hdev, event_type, mmu_base, is_pmmu, event_mask); hl_check_for_glbl_errors(hdev); @@ -9435,19 +9585,18 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_ARC_AXI_ERROR_RESPONSE_0: - reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; error_count = gaudi2_handle_arc_farm_sei_err(hdev, event_type); - event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_CPU_AXI_ERR_RSP: error_count = gaudi2_handle_cpu_sei_err(hdev, event_type); - event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; + event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR; break; case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP: case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP: - reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; error_count = gaudi2_handle_qm_sei_err(hdev, event_type, true, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -9634,12 +9783,14 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_PCIE_DRAIN_COMPLETE: error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data); + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI2_EVENT_PSOC59_RPM_ERROR_OR_DRAIN: error_count = gaudi2_handle_psoc_drain(hdev, le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; @@ -9668,6 +9819,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_PSOC_AXI_ERR_RSP: error_count = GAUDI2_NA_EVENT_CAUSE; + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI2_EVENT_PSOC_PRSTN_FALL: @@ -9681,6 +9833,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_PCIE_FATAL_ERR: error_count = GAUDI2_NA_EVENT_CAUSE; + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI2_EVENT_TPC0_BMON_SPMU: @@ -9748,6 +9901,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_CPU_PKT_QUEUE_OUT_SYNC: gaudi2_print_out_of_sync_info(hdev, event_type, &eq_entry->pkt_sync_err); error_count = GAUDI2_NA_EVENT_CAUSE; + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; @@ -9789,6 +9943,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_CPU_PKT_SANITY_FAILED: gaudi2_print_cpu_pkt_failure_info(hdev, event_type, &eq_entry->pkt_sync_err); error_count = GAUDI2_NA_EVENT_CAUSE; + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h index 0742046810f9..1cebe707772e 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h +++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h @@ -240,6 +240,8 @@ #define GAUDI2_SOB_INCREMENT_BY_ONE (FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1) | \ FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1)) +#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0) + #define GAUDI2_NUM_OF_GLBL_ERR_CAUSE 8 enum gaudi2_reserved_sob_id { @@ -453,6 +455,17 @@ struct dup_block_ctx { }; /** + * struct gaudi2_queues_test_info - Holds the address of a the messages used for testing the + * device queues. + * @dma_addr: the address used by the HW for accessing the message. + * @kern_addr: The address used by the driver for accessing the message. + */ +struct gaudi2_queues_test_info { + dma_addr_t dma_addr; + void *kern_addr; +}; + +/** * struct gaudi2_device - ASIC specific manage structure. * @cpucp_info_get: get information on device from CPU-CP * @mapped_blocks: array that holds the base address and size of all blocks @@ -510,6 +523,7 @@ struct dup_block_ctx { * @flush_db_fifo: flag to force flush DB FIFO after a write. * @hbm_cfg: HBM subsystem settings * @hw_queues_lock_mutex: used by simulator instead of hw_queues_lock. + * @queues_test_info: information used by the driver when testing the HW queues. */ struct gaudi2_device { int (*cpucp_info_get)(struct hl_device *hdev); @@ -537,6 +551,9 @@ struct gaudi2_device { u32 events_stat[GAUDI2_EVENT_SIZE]; u32 events_stat_aggregate[GAUDI2_EVENT_SIZE]; u32 num_of_valid_hw_events; + + /* Queue testing */ + struct gaudi2_queues_test_info queues_test_info[GAUDI2_NUM_TESTED_QS]; }; /* diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c index 07d67878eac5..fb0ac9df841a 100644 --- a/drivers/accel/habanalabs/goya/goya.c +++ b/drivers/accel/habanalabs/goya/goya.c @@ -473,6 +473,7 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->first_available_user_interrupt = USHRT_MAX; prop->tpc_interrupt_id = USHRT_MAX; + prop->eq_interrupt_id = GOYA_EVENT_QUEUE_MSIX_IDX; for (i = 0 ; i < HL_MAX_DCORES ; i++) prop->first_available_cq[i] = USHRT_MAX; diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h index 452b379f39f6..6c58af614236 100644 --- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h +++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2020-2022 HabanaLabs, Ltd. + * Copyright 2020-2023 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -543,6 +543,8 @@ #define HBM_MC_SPI_IEEE1500_COMP_MASK BIT(3) #define HBM_MC_SPI_IEEE1500_PAUSED_MASK BIT(4) +#define ARC_FARM_OFFSET (mmARC_FARM_ARC1_AUX_BASE - mmARC_FARM_ARC0_AUX_BASE) + #include "nic0_qpc0_regs.h" #include "nic0_qm0_regs.h" #include "nic0_qm_arc_aux0_regs.h" diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index c139aab17c8a..d9ef1b151d04 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -708,7 +708,8 @@ enum hl_server_type { HL_SERVER_GAUDI_HLS1H = 2, HL_SERVER_GAUDI_TYPE1 = 3, HL_SERVER_GAUDI_TYPE2 = 4, - HL_SERVER_GAUDI2_HLS2 = 5 + HL_SERVER_GAUDI2_HLS2 = 5, + HL_SERVER_GAUDI2_TYPE1 = 7 }; /* |