Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'drm-habanalabs-next-2023-04-10' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next

This tag contains additional habanalabs driver changes for v6.4:

- uAPI changes:
- Add a definition of a new Gaudi2 server type. This is used by userspace
to know the connectivity between the accelerators inside the
server

- New features and improvements:
- Speed up the h/w queues test in Gaudi2 to reduce device initialization times.

- Firmware related fixes:
- Fixes to the handshake protocol during f/w initialization.
- Sync f/w events interrupt in hard reset to avoid warning message.
- Improvements to extraction of the firmware version.

- Misc bug fixes and code cleanups. Notable fixes are:
- Multiple fixes for interrupt handling in Gaudi2.
- Unmap mapped memory in case TLB invalidation fails.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
From: Oded Gabbay <ogabbay@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20230410124637.GA2441888@ogabbay-vm-u20.habana-labs.com

+383 -270
+12 -3
drivers/accel/habanalabs/common/command_buffer.c
··· 45 45 } 46 46 47 47 mutex_lock(&hdev->mmu_lock); 48 + 48 49 rc = hl_mmu_map_contiguous(ctx, cb->virtual_addr, cb->bus_address, cb->roundup_size); 49 50 if (rc) { 50 51 dev_err(hdev->dev, "Failed to map VA %#llx to CB\n", cb->virtual_addr); 51 - goto err_va_umap; 52 + goto err_va_pool_free; 52 53 } 54 + 53 55 rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV); 56 + if (rc) 57 + goto err_mmu_unmap; 58 + 54 59 mutex_unlock(&hdev->mmu_lock); 55 60 56 61 cb->is_mmu_mapped = true; 57 - return rc; 58 62 59 - err_va_umap: 63 + return 0; 64 + 65 + err_mmu_unmap: 66 + hl_mmu_unmap_contiguous(ctx, cb->virtual_addr, cb->roundup_size); 67 + err_va_pool_free: 60 68 mutex_unlock(&hdev->mmu_lock); 61 69 gen_pool_free(ctx->cb_va_pool, cb->virtual_addr, cb->roundup_size); 70 + 62 71 return rc; 63 72 } 64 73
+19 -21
drivers/accel/habanalabs/common/decoder.c
··· 43 43 intr_source[2], intr_source[3], intr_source[4], intr_source[5]); 44 44 } 45 45 46 - static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id) 46 + static void dec_abnrm_intr_work(struct work_struct *work) 47 47 { 48 + struct hl_dec *dec = container_of(work, struct hl_dec, abnrm_intr_work); 49 + struct hl_device *hdev = dec->hdev; 50 + u32 irq_status, event_mask = 0; 48 51 bool reset_required = false; 49 - u32 irq_status, event_mask; 50 52 51 - irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET); 53 + irq_status = RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET); 52 54 53 - dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, core_id); 55 + dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, dec->core_id); 54 56 55 57 dec_print_abnrm_intr_source(hdev, irq_status); 56 58 57 59 /* Clear the interrupt */ 58 - WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status); 60 + WREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status); 59 61 60 62 /* Flush the interrupt clear */ 61 - RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET); 63 + RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET); 62 64 63 65 if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) { 64 66 reset_required = true; 65 - event_mask = HL_NOTIFIER_EVENT_GENERAL_HW_ERR; 66 - } else if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) { 67 - event_mask = HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; 68 - } else { 69 - event_mask = HL_NOTIFIER_EVENT_USER_ENGINE_ERR; 67 + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; 70 68 } 69 + 70 + if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) 71 + event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; 72 + 73 + if (irq_status & (VCMD_IRQ_STATUS_ENDCMD_MASK | 74 + VCMD_IRQ_STATUS_BUSERR_MASK | 75 + VCMD_IRQ_STATUS_ABORT_MASK)) 76 + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; 71 77 72 78 if (reset_required) { 73 79 event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; 74 80 hl_device_cond_reset(hdev, 0, event_mask); 75 - } else { 81 
+ } else if (event_mask) { 76 82 hl_notifier_event_send_all(hdev, event_mask); 77 83 } 78 - } 79 - 80 - static void dec_completion_abnrm(struct work_struct *work) 81 - { 82 - struct hl_dec *dec = container_of(work, struct hl_dec, completion_abnrm_work); 83 - struct hl_device *hdev = dec->hdev; 84 - 85 - dec_error_intr_work(hdev, dec->base_addr, dec->core_id); 86 84 } 87 85 88 86 void hl_dec_fini(struct hl_device *hdev) ··· 106 108 dec = hdev->dec + j; 107 109 108 110 dec->hdev = hdev; 109 - INIT_WORK(&dec->completion_abnrm_work, dec_completion_abnrm); 111 + INIT_WORK(&dec->abnrm_intr_work, dec_abnrm_intr_work); 110 112 dec->core_id = j; 111 113 dec->base_addr = hdev->asic_funcs->get_dec_base_addr(hdev, j); 112 114 if (!dec->base_addr) {
+29 -25
drivers/accel/habanalabs/common/device.c
··· 1271 1271 return 0; 1272 1272 1273 1273 disable_device: 1274 - pci_clear_master(hdev->pdev); 1275 1274 pci_disable_device(hdev->pdev); 1276 1275 1277 1276 return rc; ··· 1380 1381 mutex_unlock(fd_lock); 1381 1382 } 1382 1383 1384 + static void send_disable_pci_access(struct hl_device *hdev, u32 flags) 1385 + { 1386 + /* If reset is due to heartbeat, device CPU is no responsive in 1387 + * which case no point sending PCI disable message to it. 1388 + */ 1389 + if ((flags & HL_DRV_RESET_HARD) && 1390 + !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { 1391 + /* Disable PCI access from device F/W so he won't send 1392 + * us additional interrupts. We disable MSI/MSI-X at 1393 + * the halt_engines function and we can't have the F/W 1394 + * sending us interrupts after that. We need to disable 1395 + * the access here because if the device is marked 1396 + * disable, the message won't be send. Also, in case 1397 + * of heartbeat, the device CPU is marked as disable 1398 + * so this message won't be sent 1399 + */ 1400 + if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) { 1401 + dev_warn(hdev->dev, "Failed to disable FW's PCI access\n"); 1402 + return; 1403 + } 1404 + 1405 + /* verify that last EQs are handled before disabled is set */ 1406 + if (hdev->cpu_queues_enable) 1407 + synchronize_irq(pci_irq_vector(hdev->pdev, 1408 + hdev->asic_prop.eq_interrupt_id)); 1409 + } 1410 + } 1411 + 1383 1412 static void handle_reset_trigger(struct hl_device *hdev, u32 flags) 1384 1413 { 1385 1414 u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT; ··· 1445 1418 hdev->reset_info.reset_trigger_repeated = 0; 1446 1419 } else { 1447 1420 hdev->reset_info.reset_trigger_repeated = 1; 1448 - } 1449 - 1450 - /* If reset is due to heartbeat, device CPU is no responsive in 1451 - * which case no point sending PCI disable message to it. 
1452 - * 1453 - * If F/W is performing the reset, no need to send it a message to disable 1454 - * PCI access 1455 - */ 1456 - if ((flags & HL_DRV_RESET_HARD) && 1457 - !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { 1458 - /* Disable PCI access from device F/W so he won't send 1459 - * us additional interrupts. We disable MSI/MSI-X at 1460 - * the halt_engines function and we can't have the F/W 1461 - * sending us interrupts after that. We need to disable 1462 - * the access here because if the device is marked 1463 - * disable, the message won't be send. Also, in case 1464 - * of heartbeat, the device CPU is marked as disable 1465 - * so this message won't be sent 1466 - */ 1467 - if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) 1468 - dev_warn(hdev->dev, 1469 - "Failed to disable FW's PCI access\n"); 1470 1421 } 1471 1422 } 1472 1423 ··· 1566 1561 1567 1562 escalate_reset_flow: 1568 1563 handle_reset_trigger(hdev, flags); 1564 + send_disable_pci_access(hdev, flags); 1569 1565 1570 1566 /* This also blocks future CS/VM/JOB completion operations */ 1571 1567 hdev->disabled = true; ··· 1829 1823 dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); 1830 1824 flags = hdev->reset_info.hard_reset_schedule_flags; 1831 1825 hdev->reset_info.hard_reset_schedule_flags = 0; 1832 - hdev->disabled = true; 1833 1826 hard_reset = true; 1834 - handle_reset_trigger(hdev, flags); 1835 1827 goto escalate_reset_flow; 1836 1828 } 1837 1829 }
+7 -10
drivers/accel/habanalabs/common/firmware_if.c
··· 71 71 return NULL; 72 72 } 73 73 74 - static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver) 74 + static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) 75 75 { 76 76 char major[8], minor[8], *first_dot, *second_dot; 77 77 int rc; ··· 86 86 87 87 if (rc) { 88 88 dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc); 89 - goto out; 89 + return rc; 90 90 } 91 91 92 92 /* skip the first dot */ ··· 102 102 103 103 if (rc) 104 104 dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc); 105 - 106 - out: 107 - kfree(preboot_ver); 108 105 return rc; 109 106 } 110 107 ··· 1260 1263 COMMS_RST_DEV, 0, false, 1261 1264 hdev->fw_loader.cpu_timeout); 1262 1265 if (rc) 1263 - dev_warn(hdev->dev, "Failed sending COMMS_RST_DEV\n"); 1266 + dev_err(hdev->dev, "Failed sending COMMS_RST_DEV\n"); 1264 1267 } else { 1265 1268 WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_RST_DEV); 1266 1269 } ··· 1278 1281 /* Stop device CPU to make sure nothing bad happens */ 1279 1282 if (hdev->asic_prop.dynamic_fw_load) { 1280 1283 rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader, 1281 - COMMS_GOTO_WFE, 0, true, 1284 + COMMS_GOTO_WFE, 0, false, 1282 1285 hdev->fw_loader.cpu_timeout); 1283 1286 if (rc) 1284 - dev_warn(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); 1287 + dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); 1285 1288 } else { 1286 1289 WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE); 1287 1290 msleep(static_loader->cpu_reset_wait_msec); ··· 2178 2181 2179 2182 dev_info(hdev->dev, "preboot version %s\n", preboot_ver); 2180 2183 2181 - /* This function takes care of freeing preboot_ver */ 2182 - rc = extract_fw_sub_versions(hdev, preboot_ver); 2184 + rc = hl_get_preboot_major_minor(hdev, preboot_ver); 2185 + kfree(preboot_ver); 2183 2186 if (rc) 2184 2187 return rc; 2185 2188 }
+7 -7
drivers/accel/habanalabs/common/habanalabs.h
··· 662 662 * @user_interrupt_count: number of user interrupts. 663 663 * @user_dec_intr_count: number of decoder interrupts exposed to user. 664 664 * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host. 665 - * @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error. 665 + * @eq_interrupt_id: interrupt id for EQ, uses to synchronize EQ interrupts in hard-reset. 666 666 * @cache_line_size: device cache line size. 667 667 * @server_type: Server type that the ASIC is currently installed in. 668 668 * The value is according to enum hl_server_type in uapi file. ··· 793 793 u16 user_interrupt_count; 794 794 u16 user_dec_intr_count; 795 795 u16 tpc_interrupt_id; 796 - u16 unexpected_user_error_interrupt_id; 796 + u16 eq_interrupt_id; 797 797 u16 cache_line_size; 798 798 u16 server_type; 799 799 u8 completion_queues_count; ··· 1211 1211 /** 1212 1212 * struct hl_dec - describes a decoder sw instance. 1213 1213 * @hdev: pointer to the device structure. 1214 - * @completion_abnrm_work: workqueue object to run when decoder generates an error interrupt 1214 + * @abnrm_intr_work: workqueue work item to run when decoder generates an error interrupt. 1215 1215 * @core_id: ID of the decoder. 1216 1216 * @base_addr: base address of the decoder. 1217 1217 */ 1218 1218 struct hl_dec { 1219 - struct hl_device *hdev; 1220 - struct work_struct completion_abnrm_work; 1221 - u32 core_id; 1222 - u32 base_addr; 1219 + struct hl_device *hdev; 1220 + struct work_struct abnrm_intr_work; 1221 + u32 core_id; 1222 + u32 base_addr; 1223 1223 }; 1224 1224 1225 1225 /**
+7 -4
drivers/accel/habanalabs/common/irq.c
··· 415 415 struct hl_eq_entry *eq_base; 416 416 struct hl_eqe_work *handle_eqe_work; 417 417 bool entry_ready; 418 - u32 cur_eqe; 419 - u16 cur_eqe_index; 418 + u32 cur_eqe, ctl; 419 + u16 cur_eqe_index, event_type; 420 420 421 421 eq_base = eq->kernel_address; 422 422 ··· 449 449 dma_rmb(); 450 450 451 451 if (hdev->disabled && !hdev->reset_info.in_compute_reset) { 452 - dev_warn(hdev->dev, "Device disabled but received an EQ event\n"); 452 + ctl = le32_to_cpu(eq_entry->hdr.ctl); 453 + event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK) >> EQ_CTL_EVENT_TYPE_SHIFT); 454 + dev_warn(hdev->dev, 455 + "Device disabled but received an EQ event (%u)\n", event_type); 453 456 goto skip_irq; 454 457 } 455 458 ··· 489 486 { 490 487 struct hl_dec *dec = arg; 491 488 492 - schedule_work(&dec->completion_abnrm_work); 489 + schedule_work(&dec->abnrm_intr_work); 493 490 494 491 return IRQ_HANDLED; 495 492 }
+8 -3
drivers/accel/habanalabs/common/memory.c
··· 605 605 bool is_align_pow_2 = is_power_of_2(va_range->page_size); 606 606 bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr); 607 607 bool force_hint = flags & HL_MEM_FORCE_HINT; 608 + int rc; 608 609 609 610 if (is_align_pow_2) 610 611 align_mask = ~((u64)va_block_align - 1); ··· 723 722 kfree(new_va_block); 724 723 } 725 724 726 - if (add_prev) 727 - add_va_block_locked(hdev, &va_range->list, prev_start, 728 - prev_end); 725 + if (add_prev) { 726 + rc = add_va_block_locked(hdev, &va_range->list, prev_start, prev_end); 727 + if (rc) { 728 + reserved_valid_start = 0; 729 + goto out; 730 + } 731 + } 729 732 730 733 print_va_list_locked(hdev, &va_range->list); 731 734 out:
+6 -2
drivers/accel/habanalabs/common/mmu/mmu.c
··· 679 679 680 680 rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags); 681 681 if (rc) 682 - dev_err_ratelimited(hdev->dev, "MMU cache invalidation failed\n"); 682 + dev_err_ratelimited(hdev->dev, 683 + "%s cache invalidation failed, rc=%d\n", 684 + flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", rc); 683 685 684 686 return rc; 685 687 } ··· 694 692 rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, is_hard, flags, 695 693 asid, va, size); 696 694 if (rc) 697 - dev_err_ratelimited(hdev->dev, "MMU cache range invalidation failed\n"); 695 + dev_err_ratelimited(hdev->dev, 696 + "%s cache range invalidation failed: va=%#llx, size=%llu, rc=%d", 697 + flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", va, size, rc); 698 698 699 699 return rc; 700 700 }
-2
drivers/accel/habanalabs/common/pci/pci.c
··· 420 420 unmap_pci_bars: 421 421 hl_pci_bars_unmap(hdev); 422 422 disable_device: 423 - pci_clear_master(pdev); 424 423 pci_disable_device(pdev); 425 424 426 425 return rc; ··· 435 436 { 436 437 hl_pci_bars_unmap(hdev); 437 438 438 - pci_clear_master(hdev->pdev); 439 439 pci_disable_device(hdev->pdev); 440 440 }
+5 -1
drivers/accel/habanalabs/common/sysfs.c
··· 497 497 if (rc) { 498 498 dev_err(hdev->dev, 499 499 "Failed to add groups to device, error %d\n", rc); 500 - return rc; 500 + goto remove_groups; 501 501 } 502 502 503 503 return 0; 504 + 505 + remove_groups: 506 + device_remove_groups(hdev->dev, hl_dev_attr_groups); 507 + return rc; 504 508 } 505 509 506 510 void hl_sysfs_fini(struct hl_device *hdev)
+8 -78
drivers/accel/habanalabs/gaudi/gaudi.c
··· 682 682 prop->first_available_user_interrupt = USHRT_MAX; 683 683 prop->tpc_interrupt_id = USHRT_MAX; 684 684 685 + /* single msi */ 686 + prop->eq_interrupt_id = 0; 687 + 685 688 for (i = 0 ; i < HL_MAX_DCORES ; i++) 686 689 prop->first_available_cq[i] = USHRT_MAX; 687 690 ··· 2020 2017 return rc; 2021 2018 } 2022 2019 2023 - static int gaudi_enable_msi_multi(struct hl_device *hdev) 2024 - { 2025 - int cq_cnt = hdev->asic_prop.completion_queues_count; 2026 - int rc, i, irq_cnt_init, irq; 2027 - 2028 - for (i = 0, irq_cnt_init = 0 ; i < cq_cnt ; i++, irq_cnt_init++) { 2029 - irq = gaudi_pci_irq_vector(hdev, i, false); 2030 - rc = request_irq(irq, hl_irq_handler_cq, 0, gaudi_irq_name[i], 2031 - &hdev->completion_queue[i]); 2032 - if (rc) { 2033 - dev_err(hdev->dev, "Failed to request IRQ %d", irq); 2034 - goto free_irqs; 2035 - } 2036 - } 2037 - 2038 - irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX, true); 2039 - rc = request_irq(irq, hl_irq_handler_eq, 0, gaudi_irq_name[cq_cnt], 2040 - &hdev->event_queue); 2041 - if (rc) { 2042 - dev_err(hdev->dev, "Failed to request IRQ %d", irq); 2043 - goto free_irqs; 2044 - } 2045 - 2046 - return 0; 2047 - 2048 - free_irqs: 2049 - for (i = 0 ; i < irq_cnt_init ; i++) 2050 - free_irq(gaudi_pci_irq_vector(hdev, i, false), 2051 - &hdev->completion_queue[i]); 2052 - return rc; 2053 - } 2054 - 2055 2020 static int gaudi_enable_msi(struct hl_device *hdev) 2056 2021 { 2057 2022 struct gaudi_device *gaudi = hdev->asic_specific; ··· 2034 2063 return rc; 2035 2064 } 2036 2065 2037 - if (rc < NUMBER_OF_INTERRUPTS) { 2038 - gaudi->multi_msi_mode = false; 2039 - rc = gaudi_enable_msi_single(hdev); 2040 - } else { 2041 - gaudi->multi_msi_mode = true; 2042 - rc = gaudi_enable_msi_multi(hdev); 2043 - } 2044 - 2066 + rc = gaudi_enable_msi_single(hdev); 2045 2067 if (rc) 2046 2068 goto free_pci_irq_vectors; 2047 2069 ··· 2050 2086 static void gaudi_sync_irqs(struct hl_device *hdev) 2051 2087 { 2052 2088 struct gaudi_device *gaudi 
= hdev->asic_specific; 2053 - int i, cq_cnt = hdev->asic_prop.completion_queues_count; 2054 2089 2055 2090 if (!(gaudi->hw_cap_initialized & HW_CAP_MSI)) 2056 2091 return; 2057 2092 2058 2093 /* Wait for all pending IRQs to be finished */ 2059 - if (gaudi->multi_msi_mode) { 2060 - for (i = 0 ; i < cq_cnt ; i++) 2061 - synchronize_irq(gaudi_pci_irq_vector(hdev, i, false)); 2062 - 2063 - synchronize_irq(gaudi_pci_irq_vector(hdev, 2064 - GAUDI_EVENT_QUEUE_MSI_IDX, 2065 - true)); 2066 - } else { 2067 - synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false)); 2068 - } 2094 + synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false)); 2069 2095 } 2070 2096 2071 2097 static void gaudi_disable_msi(struct hl_device *hdev) 2072 2098 { 2073 2099 struct gaudi_device *gaudi = hdev->asic_specific; 2074 - int i, irq, cq_cnt = hdev->asic_prop.completion_queues_count; 2075 2100 2076 2101 if (!(gaudi->hw_cap_initialized & HW_CAP_MSI)) 2077 2102 return; 2078 2103 2079 2104 gaudi_sync_irqs(hdev); 2080 - 2081 - if (gaudi->multi_msi_mode) { 2082 - irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX, 2083 - true); 2084 - free_irq(irq, &hdev->event_queue); 2085 - 2086 - for (i = 0 ; i < cq_cnt ; i++) { 2087 - irq = gaudi_pci_irq_vector(hdev, i, false); 2088 - free_irq(irq, &hdev->completion_queue[i]); 2089 - } 2090 - } else { 2091 - free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev); 2092 - } 2093 - 2105 + free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev); 2094 2106 pci_free_irq_vectors(hdev->pdev); 2095 2107 2096 2108 gaudi->hw_cap_initialized &= ~HW_CAP_MSI; ··· 3861 3921 3862 3922 WREG32(mmCPU_IF_PF_PQ_PI, 0); 3863 3923 3864 - if (gaudi->multi_msi_mode) 3865 - WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP); 3866 - else 3867 - WREG32(mmCPU_IF_QUEUE_INIT, 3868 - PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI); 3924 + WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI); 3869 3925 3870 3926 irq_handler_offset = prop->gic_interrupts_enable ? 
3871 3927 mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : ··· 5538 5602 u32 len, u32 original_len, u64 cq_addr, u32 cq_val, 5539 5603 u32 msi_vec, bool eb) 5540 5604 { 5541 - struct gaudi_device *gaudi = hdev->asic_specific; 5542 5605 struct packet_msg_prot *cq_pkt; 5543 5606 struct packet_nop *cq_padding; 5544 5607 u64 msi_addr; ··· 5567 5632 tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); 5568 5633 cq_pkt->ctl = cpu_to_le32(tmp); 5569 5634 cq_pkt->value = cpu_to_le32(1); 5570 - 5571 - if (gaudi->multi_msi_mode) 5572 - msi_addr = mmPCIE_MSI_INTR_0 + msi_vec * 4; 5573 - else 5574 - msi_addr = mmPCIE_CORE_MSI_REQ; 5575 - 5635 + msi_addr = hdev->pdev ? mmPCIE_CORE_MSI_REQ : mmPCIE_MSI_INTR_0 + msi_vec * 4; 5576 5636 cq_pkt->addr = cpu_to_le64(CFG_BASE + msi_addr); 5577 5637 } 5578 5638
-15
drivers/accel/habanalabs/gaudi/gaudiP.h
··· 28 28 #define NUMBER_OF_COLLECTIVE_QUEUES 12 29 29 #define NUMBER_OF_SOBS_IN_GRP 11 30 30 31 - /* 32 - * Number of MSI interrupts IDS: 33 - * Each completion queue has 1 ID 34 - * The event queue has 1 ID 35 - */ 36 - #define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + \ 37 - NUMBER_OF_CPU_HW_QUEUES) 38 - 39 31 #define GAUDI_STREAM_MASTER_ARR_SIZE 8 40 - 41 - #if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES) 42 - #error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES" 43 - #endif 44 32 45 33 #define CORESIGHT_TIMEOUT_USEC 100000 /* 100 ms */ 46 34 ··· 312 324 * signal we can use this engine in later code paths. 313 325 * Each bit is cleared upon reset of its corresponding H/W 314 326 * engine. 315 - * @multi_msi_mode: whether we are working in multi MSI single MSI mode. 316 - * Multi MSI is possible only with IOMMU enabled. 317 327 * @mmu_cache_inv_pi: PI for MMU cache invalidation flow. The H/W expects an 318 328 * 8-bit value so use u8. 319 329 */ ··· 331 345 u32 events_stat[GAUDI_EVENT_SIZE]; 332 346 u32 events_stat_aggregate[GAUDI_EVENT_SIZE]; 333 347 u32 hw_cap_initialized; 334 - u8 multi_msi_mode; 335 348 u8 mmu_cache_inv_pi; 336 349 }; 337 350
+252 -97
drivers/accel/habanalabs/gaudi2/gaudi2.c
··· 2112 2112 static bool gaudi2_get_edma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, 2113 2113 struct engines_data *e); 2114 2114 static u64 gaudi2_mmu_scramble_addr(struct hl_device *hdev, u64 raw_addr); 2115 + static u64 gaudi2_mmu_descramble_addr(struct hl_device *hdev, u64 scrambled_addr); 2115 2116 2116 2117 static void gaudi2_init_scrambler_hbm(struct hl_device *hdev) 2117 2118 { ··· 2439 2438 2440 2439 prop->first_available_user_interrupt = GAUDI2_IRQ_NUM_USER_FIRST; 2441 2440 prop->tpc_interrupt_id = GAUDI2_IRQ_NUM_TPC_ASSERT; 2442 - prop->unexpected_user_error_interrupt_id = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR; 2441 + prop->eq_interrupt_id = GAUDI2_IRQ_NUM_EVENT_QUEUE; 2443 2442 2444 2443 prop->first_available_cq[0] = GAUDI2_RESERVED_CQ_NUMBER; 2445 2444 ··· 2887 2886 hdev->edma_binning = prop->cpucp_info.edma_binning_mask; 2888 2887 hdev->tpc_binning = le64_to_cpu(prop->cpucp_info.tpc_binning_mask); 2889 2888 hdev->decoder_binning = lower_32_bits(le64_to_cpu(prop->cpucp_info.decoder_binning_mask)); 2889 + 2890 + dev_dbg(hdev->dev, "Read binning masks: tpc: 0x%llx, dram: 0x%llx, edma: 0x%x, dec: 0x%x\n", 2891 + hdev->tpc_binning, hdev->dram_binning, hdev->edma_binning, 2892 + hdev->decoder_binning); 2890 2893 2891 2894 /* 2892 2895 * at this point the DRAM parameters need to be updated according to data obtained ··· 3350 3345 /* Initialize TPC interrupt */ 3351 3346 HL_USR_INTR_STRUCT_INIT(hdev->tpc_interrupt, hdev, 0, HL_USR_INTERRUPT_TPC); 3352 3347 3353 - /* Initialize general purpose interrupt */ 3348 + /* Initialize unexpected error interrupt */ 3354 3349 HL_USR_INTR_STRUCT_INIT(hdev->unexpected_error_interrupt, hdev, 0, 3355 3350 HL_USR_INTERRUPT_UNEXPECTED); 3356 3351 ··· 3480 3475 return gaudi2_special_blocks_config(hdev); 3481 3476 } 3482 3477 3478 + static void gaudi2_test_queues_msgs_free(struct hl_device *hdev) 3479 + { 3480 + struct gaudi2_device *gaudi2 = hdev->asic_specific; 3481 + struct gaudi2_queues_test_info *msg_info = 
gaudi2->queues_test_info; 3482 + int i; 3483 + 3484 + for (i = 0 ; i < GAUDI2_NUM_TESTED_QS ; i++) { 3485 + /* bail-out if this is an allocation failure point */ 3486 + if (!msg_info[i].kern_addr) 3487 + break; 3488 + 3489 + hl_asic_dma_pool_free(hdev, msg_info[i].kern_addr, msg_info[i].dma_addr); 3490 + msg_info[i].kern_addr = NULL; 3491 + } 3492 + } 3493 + 3494 + static int gaudi2_test_queues_msgs_alloc(struct hl_device *hdev) 3495 + { 3496 + struct gaudi2_device *gaudi2 = hdev->asic_specific; 3497 + struct gaudi2_queues_test_info *msg_info = gaudi2->queues_test_info; 3498 + int i, rc; 3499 + 3500 + /* allocate a message-short buf for each Q we intend to test */ 3501 + for (i = 0 ; i < GAUDI2_NUM_TESTED_QS ; i++) { 3502 + msg_info[i].kern_addr = 3503 + (void *)hl_asic_dma_pool_zalloc(hdev, sizeof(struct packet_msg_short), 3504 + GFP_KERNEL, &msg_info[i].dma_addr); 3505 + if (!msg_info[i].kern_addr) { 3506 + dev_err(hdev->dev, 3507 + "Failed to allocate dma memory for H/W queue %d testing\n", i); 3508 + rc = -ENOMEM; 3509 + goto err_exit; 3510 + } 3511 + } 3512 + 3513 + return 0; 3514 + 3515 + err_exit: 3516 + gaudi2_test_queues_msgs_free(hdev); 3517 + return rc; 3518 + } 3519 + 3483 3520 static int gaudi2_sw_init(struct hl_device *hdev) 3484 3521 { 3485 3522 struct asic_fixed_properties *prop = &hdev->asic_prop; ··· 3621 3574 if (rc) 3622 3575 goto free_scratchpad_mem; 3623 3576 3577 + rc = gaudi2_test_queues_msgs_alloc(hdev); 3578 + if (rc) 3579 + goto special_blocks_free; 3580 + 3624 3581 return 0; 3625 3582 3583 + special_blocks_free: 3584 + gaudi2_special_blocks_iterator_free(hdev); 3626 3585 free_scratchpad_mem: 3627 3586 hl_asic_dma_pool_free(hdev, gaudi2->scratchpad_kernel_address, 3628 3587 gaudi2->scratchpad_bus_address); ··· 3650 3597 { 3651 3598 struct asic_fixed_properties *prop = &hdev->asic_prop; 3652 3599 struct gaudi2_device *gaudi2 = hdev->asic_specific; 3600 + 3601 + gaudi2_test_queues_msgs_free(hdev); 3653 3602 3654 3603 
gaudi2_special_blocks_iterator_free(hdev); 3655 3604 ··· 4064 4009 case GAUDI2_IRQ_NUM_TPC_ASSERT: 4065 4010 return "gaudi2 tpc assert"; 4066 4011 case GAUDI2_IRQ_NUM_UNEXPECTED_ERROR: 4067 - return "gaudi2 tpc assert"; 4012 + return "gaudi2 unexpected error"; 4068 4013 case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST: 4069 4014 return "gaudi2 user completion"; 4070 4015 default: ··· 6847 6792 } 6848 6793 } 6849 6794 6850 - static int gaudi2_test_queue(struct hl_device *hdev, u32 hw_queue_id) 6795 + static inline u32 gaudi2_test_queue_hw_queue_id_to_sob_id(struct hl_device *hdev, u32 hw_queue_id) 6851 6796 { 6852 - u32 sob_offset = hdev->asic_prop.first_available_user_sob[0] * 4; 6797 + return hdev->asic_prop.first_available_user_sob[0] + 6798 + hw_queue_id - GAUDI2_QUEUE_ID_PDMA_0_0; 6799 + } 6800 + 6801 + static void gaudi2_test_queue_clear(struct hl_device *hdev, u32 hw_queue_id) 6802 + { 6803 + u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4; 6853 6804 u32 sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset; 6854 - u32 timeout_usec, tmp, sob_base = 1, sob_val = 0x5a5a; 6855 - struct packet_msg_short *msg_short_pkt; 6856 - dma_addr_t pkt_dma_addr; 6857 - size_t pkt_size; 6805 + 6806 + /* Reset the SOB value */ 6807 + WREG32(sob_addr, 0); 6808 + } 6809 + 6810 + static int gaudi2_test_queue_send_msg_short(struct hl_device *hdev, u32 hw_queue_id, u32 sob_val, 6811 + struct gaudi2_queues_test_info *msg_info) 6812 + { 6813 + u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4; 6814 + u32 tmp, sob_base = 1; 6815 + struct packet_msg_short *msg_short_pkt = msg_info->kern_addr; 6816 + size_t pkt_size = sizeof(struct packet_msg_short); 6858 6817 int rc; 6859 - 6860 - if (hdev->pldm) 6861 - timeout_usec = GAUDI2_PLDM_TEST_QUEUE_WAIT_USEC; 6862 - else 6863 - timeout_usec = GAUDI2_TEST_QUEUE_WAIT_USEC; 6864 - 6865 - pkt_size = sizeof(*msg_short_pkt); 6866 - msg_short_pkt = 
hl_asic_dma_pool_zalloc(hdev, pkt_size, GFP_KERNEL, &pkt_dma_addr); 6867 - if (!msg_short_pkt) { 6868 - dev_err(hdev->dev, "Failed to allocate packet for H/W queue %d testing\n", 6869 - hw_queue_id); 6870 - return -ENOMEM; 6871 - } 6872 6818 6873 6819 tmp = (PACKET_MSG_SHORT << GAUDI2_PKT_CTL_OPCODE_SHIFT) | 6874 6820 (1 << GAUDI2_PKT_CTL_EB_SHIFT) | ··· 6880 6824 msg_short_pkt->value = cpu_to_le32(sob_val); 6881 6825 msg_short_pkt->ctl = cpu_to_le32(tmp); 6882 6826 6883 - /* Reset the SOB value */ 6884 - WREG32(sob_addr, 0); 6827 + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, msg_info->dma_addr); 6828 + if (rc) 6829 + dev_err(hdev->dev, 6830 + "Failed to send msg_short packet to H/W queue %d\n", hw_queue_id); 6885 6831 6886 - rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr); 6887 - if (rc) { 6888 - dev_err(hdev->dev, "Failed to send msg_short packet to H/W queue %d\n", 6889 - hw_queue_id); 6890 - goto free_pkt; 6891 - } 6832 + return rc; 6833 + } 6834 + 6835 + static int gaudi2_test_queue_wait_completion(struct hl_device *hdev, u32 hw_queue_id, u32 sob_val) 6836 + { 6837 + u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4; 6838 + u32 sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset; 6839 + u32 timeout_usec, tmp; 6840 + int rc; 6841 + 6842 + if (hdev->pldm) 6843 + timeout_usec = GAUDI2_PLDM_TEST_QUEUE_WAIT_USEC; 6844 + else 6845 + timeout_usec = GAUDI2_TEST_QUEUE_WAIT_USEC; 6892 6846 6893 6847 rc = hl_poll_timeout( 6894 6848 hdev, ··· 6914 6848 rc = -EIO; 6915 6849 } 6916 6850 6917 - /* Reset the SOB value */ 6918 - WREG32(sob_addr, 0); 6919 - 6920 - free_pkt: 6921 - hl_asic_dma_pool_free(hdev, (void *) msg_short_pkt, pkt_dma_addr); 6922 6851 return rc; 6923 6852 } 6924 6853 ··· 6933 6872 6934 6873 static int gaudi2_test_queues(struct hl_device *hdev) 6935 6874 { 6936 - int i, rc, ret_val = 0; 6875 + struct gaudi2_device *gaudi2 = hdev->asic_specific; 6876 + struct 
gaudi2_queues_test_info *msg_info; 6877 + u32 sob_val = 0x5a5a; 6878 + int i, rc; 6937 6879 6880 + /* send test message on all enabled Qs */ 6938 6881 for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { 6939 6882 if (!gaudi2_is_queue_enabled(hdev, i)) 6940 6883 continue; 6941 6884 6885 + msg_info = &gaudi2->queues_test_info[i - GAUDI2_QUEUE_ID_PDMA_0_0]; 6942 6886 gaudi2_qman_set_test_mode(hdev, i, true); 6943 - rc = gaudi2_test_queue(hdev, i); 6944 - gaudi2_qman_set_test_mode(hdev, i, false); 6945 - 6946 - if (rc) { 6947 - ret_val = -EINVAL; 6887 + gaudi2_test_queue_clear(hdev, i); 6888 + rc = gaudi2_test_queue_send_msg_short(hdev, i, sob_val, msg_info); 6889 + if (rc) 6948 6890 goto done; 6949 - } 6950 6891 } 6951 6892 6952 6893 rc = gaudi2_test_cpu_queue(hdev); 6953 - if (rc) { 6954 - ret_val = -EINVAL; 6894 + if (rc) 6955 6895 goto done; 6896 + 6897 + /* verify that all messages were processed */ 6898 + for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { 6899 + if (!gaudi2_is_queue_enabled(hdev, i)) 6900 + continue; 6901 + 6902 + rc = gaudi2_test_queue_wait_completion(hdev, i, sob_val); 6903 + if (rc) 6904 + /* chip is not usable, no need for cleanups, just bail-out with error */ 6905 + goto done; 6906 + 6907 + gaudi2_test_queue_clear(hdev, i); 6908 + gaudi2_qman_set_test_mode(hdev, i, false); 6956 6909 } 6957 6910 6958 6911 done: 6959 - return ret_val; 6912 + return rc; 6960 6913 } 6961 6914 6962 6915 static int gaudi2_compute_reset_late_init(struct hl_device *hdev) ··· 8560 8485 8561 8486 static int gaudi2_handle_arc_farm_sei_err(struct hl_device *hdev, u16 event_type) 8562 8487 { 8563 - u32 i, sts_val, sts_clr_val = 0, error_count = 0; 8488 + u32 i, sts_val, sts_clr_val, error_count = 0, arc_farm; 8564 8489 8565 - sts_val = RREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_STS); 8490 + for (arc_farm = 0 ; arc_farm < NUM_OF_ARC_FARMS_ARC ; arc_farm++) { 8491 + sts_clr_val = 0; 8492 + sts_val = 
RREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_STS + 8493 + (arc_farm * ARC_FARM_OFFSET)); 8566 8494 8567 - for (i = 0 ; i < GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE ; i++) { 8568 - if (sts_val & BIT(i)) { 8569 - gaudi2_print_event(hdev, event_type, true, 8570 - "err cause: %s", gaudi2_arc_sei_error_cause[i]); 8571 - sts_clr_val |= BIT(i); 8572 - error_count++; 8495 + for (i = 0 ; i < GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE ; i++) { 8496 + if (sts_val & BIT(i)) { 8497 + gaudi2_print_event(hdev, event_type, true, 8498 + "ARC FARM ARC %u err cause: %s", 8499 + arc_farm, gaudi2_arc_sei_error_cause[i]); 8500 + sts_clr_val |= BIT(i); 8501 + error_count++; 8502 + } 8573 8503 } 8504 + WREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_CLR + (arc_farm * ARC_FARM_OFFSET), 8505 + sts_clr_val); 8574 8506 } 8575 8507 8576 8508 hl_check_for_glbl_errors(hdev); 8577 - 8578 - WREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_CLR, sts_clr_val); 8579 8509 8580 8510 return error_count; 8581 8511 } ··· 8924 8844 static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu, 8925 8845 u64 *event_mask) 8926 8846 { 8927 - u32 valid, val, axid_l, axid_h; 8847 + u32 valid, val; 8928 8848 u64 addr; 8929 8849 8930 8850 valid = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID)); ··· 8937 8857 addr <<= 32; 8938 8858 addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE_VA)); 8939 8859 8940 - axid_l = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_FAULT_ID_LSB)); 8941 - axid_h = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_FAULT_ID_MSB)); 8860 + if (!is_pmmu) 8861 + addr = gaudi2_mmu_descramble_addr(hdev, addr); 8942 8862 8943 - dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx, transaction id 0x%llX\n", 8944 - is_pmmu ? "PMMU" : "HMMU", addr, ((u64)axid_h << 32) + axid_l); 8863 + dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n", 8864 + is_pmmu ? 
"PMMU" : "HMMU", addr); 8945 8865 hl_handle_page_fault(hdev, addr, 0, is_pmmu, event_mask); 8946 8866 8947 8867 WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0); ··· 8962 8882 addr <<= 32; 8963 8883 addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE_VA)); 8964 8884 8885 + if (!is_pmmu) 8886 + addr = gaudi2_mmu_descramble_addr(hdev, addr); 8887 + 8965 8888 dev_err_ratelimited(hdev->dev, "%s access error on va 0x%llx\n", 8966 8889 is_pmmu ? "PMMU" : "HMMU", addr); 8967 - WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE), 0); 8890 + WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0); 8968 8891 } 8969 8892 8970 8893 static int gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, u16 event_type, ··· 9059 8976 return error_count; 9060 8977 } 9061 8978 8979 + static u64 get_hmmu_base(u16 event_type) 8980 + { 8981 + u8 dcore, index_in_dcore; 8982 + 8983 + switch (event_type) { 8984 + case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP: 8985 + case GAUDI2_EVENT_HMMU0_SPI_BASE ... GAUDI2_EVENT_HMMU0_SECURITY_ERROR: 8986 + dcore = 0; 8987 + index_in_dcore = 0; 8988 + break; 8989 + case GAUDI2_EVENT_HMMU_1_AXI_ERR_RSP: 8990 + case GAUDI2_EVENT_HMMU1_SPI_BASE ... GAUDI2_EVENT_HMMU1_SECURITY_ERROR: 8991 + dcore = 1; 8992 + index_in_dcore = 0; 8993 + break; 8994 + case GAUDI2_EVENT_HMMU_2_AXI_ERR_RSP: 8995 + case GAUDI2_EVENT_HMMU2_SPI_BASE ... GAUDI2_EVENT_HMMU2_SECURITY_ERROR: 8996 + dcore = 0; 8997 + index_in_dcore = 1; 8998 + break; 8999 + case GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP: 9000 + case GAUDI2_EVENT_HMMU3_SPI_BASE ... GAUDI2_EVENT_HMMU3_SECURITY_ERROR: 9001 + dcore = 1; 9002 + index_in_dcore = 1; 9003 + break; 9004 + case GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP: 9005 + case GAUDI2_EVENT_HMMU4_SPI_BASE ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR: 9006 + dcore = 3; 9007 + index_in_dcore = 2; 9008 + break; 9009 + case GAUDI2_EVENT_HMMU_5_AXI_ERR_RSP: 9010 + case GAUDI2_EVENT_HMMU5_SPI_BASE ... 
GAUDI2_EVENT_HMMU5_SECURITY_ERROR: 9011 + dcore = 2; 9012 + index_in_dcore = 2; 9013 + break; 9014 + case GAUDI2_EVENT_HMMU_6_AXI_ERR_RSP: 9015 + case GAUDI2_EVENT_HMMU6_SPI_BASE ... GAUDI2_EVENT_HMMU6_SECURITY_ERROR: 9016 + dcore = 3; 9017 + index_in_dcore = 3; 9018 + break; 9019 + case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP: 9020 + case GAUDI2_EVENT_HMMU7_SPI_BASE ... GAUDI2_EVENT_HMMU7_SECURITY_ERROR: 9021 + dcore = 2; 9022 + index_in_dcore = 3; 9023 + break; 9024 + case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP: 9025 + case GAUDI2_EVENT_HMMU8_SPI_BASE ... GAUDI2_EVENT_HMMU8_SECURITY_ERROR: 9026 + dcore = 0; 9027 + index_in_dcore = 2; 9028 + break; 9029 + case GAUDI2_EVENT_HMMU_9_AXI_ERR_RSP: 9030 + case GAUDI2_EVENT_HMMU9_SPI_BASE ... GAUDI2_EVENT_HMMU9_SECURITY_ERROR: 9031 + dcore = 1; 9032 + index_in_dcore = 2; 9033 + break; 9034 + case GAUDI2_EVENT_HMMU_10_AXI_ERR_RSP: 9035 + case GAUDI2_EVENT_HMMU10_SPI_BASE ... GAUDI2_EVENT_HMMU10_SECURITY_ERROR: 9036 + dcore = 0; 9037 + index_in_dcore = 3; 9038 + break; 9039 + case GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP: 9040 + case GAUDI2_EVENT_HMMU11_SPI_BASE ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR: 9041 + dcore = 1; 9042 + index_in_dcore = 3; 9043 + break; 9044 + case GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: 9045 + case GAUDI2_EVENT_HMMU12_SPI_BASE ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR: 9046 + dcore = 3; 9047 + index_in_dcore = 0; 9048 + break; 9049 + case GAUDI2_EVENT_HMMU_13_AXI_ERR_RSP: 9050 + case GAUDI2_EVENT_HMMU13_SPI_BASE ... GAUDI2_EVENT_HMMU13_SECURITY_ERROR: 9051 + dcore = 2; 9052 + index_in_dcore = 0; 9053 + break; 9054 + case GAUDI2_EVENT_HMMU_14_AXI_ERR_RSP: 9055 + case GAUDI2_EVENT_HMMU14_SPI_BASE ... GAUDI2_EVENT_HMMU14_SECURITY_ERROR: 9056 + dcore = 3; 9057 + index_in_dcore = 1; 9058 + break; 9059 + case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP: 9060 + case GAUDI2_EVENT_HMMU15_SPI_BASE ... 
GAUDI2_EVENT_HMMU15_SECURITY_ERROR: 9061 + dcore = 2; 9062 + index_in_dcore = 1; 9063 + break; 9064 + default: 9065 + return ULONG_MAX; 9066 + } 9067 + 9068 + return mmDCORE0_HMMU0_MMU_BASE + dcore * DCORE_OFFSET + index_in_dcore * DCORE_HMMU_OFFSET; 9069 + } 9070 + 9062 9071 static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask) 9063 9072 { 9064 9073 bool is_pmmu = false; 9065 9074 u32 error_count = 0; 9066 9075 u64 mmu_base; 9067 - u8 index; 9068 9076 9069 9077 switch (event_type) { 9070 - case GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM ... GAUDI2_EVENT_HMMU3_SECURITY_ERROR: 9071 - index = (event_type - GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM) / 3; 9072 - mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; 9078 + case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: 9079 + case GAUDI2_EVENT_HMMU0_SPI_BASE ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR: 9080 + mmu_base = get_hmmu_base(event_type); 9073 9081 break; 9074 - case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP: 9075 - index = (event_type - GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP); 9076 - mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; 9077 - break; 9078 - case GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR: 9079 - index = (event_type - GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM) / 3; 9080 - mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; 9081 - break; 9082 - case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP: 9083 - index = (event_type - GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP); 9084 - mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; 9085 - break; 9086 - case GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR: 9087 - index = (event_type - GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM) / 3; 9088 - mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; 9089 - break; 9090 - case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP ... 
GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP: 9091 - index = (event_type - GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP); 9092 - mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; 9093 - break; 9094 - case GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR: 9095 - index = (event_type - GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM) / 3; 9096 - mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; 9097 - break; 9098 - case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: 9099 - index = (event_type - GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP); 9100 - mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; 9101 - break; 9082 + 9102 9083 case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR: 9103 9084 case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0: 9104 9085 is_pmmu = true; ··· 9171 9024 default: 9172 9025 return 0; 9173 9026 } 9027 + 9028 + if (mmu_base == ULONG_MAX) 9029 + return 0; 9174 9030 9175 9031 error_count = gaudi2_handle_mmu_spi_sei_generic(hdev, event_type, mmu_base, 9176 9032 is_pmmu, event_mask); ··· 9585 9435 break; 9586 9436 9587 9437 case GAUDI2_EVENT_ARC_AXI_ERROR_RESPONSE_0: 9588 - reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; 9589 9438 error_count = gaudi2_handle_arc_farm_sei_err(hdev, event_type); 9590 - event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; 9439 + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; 9591 9440 break; 9592 9441 9593 9442 case GAUDI2_EVENT_CPU_AXI_ERR_RSP: 9594 9443 error_count = gaudi2_handle_cpu_sei_err(hdev, event_type); 9595 - event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; 9444 + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; 9445 + event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR; 9596 9446 break; 9597 9447 9598 9448 case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP: 9599 9449 case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP: 9600 - reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; 9601 9450 error_count = gaudi2_handle_qm_sei_err(hdev, event_type, true, &event_mask); 9602 9451 event_mask |= 
HL_NOTIFIER_EVENT_USER_ENGINE_ERR; 9603 9452 break; ··· 9783 9634 9784 9635 case GAUDI2_EVENT_PCIE_DRAIN_COMPLETE: 9785 9636 error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data); 9637 + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; 9786 9638 event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; 9787 9639 break; 9788 9640 9789 9641 case GAUDI2_EVENT_PSOC59_RPM_ERROR_OR_DRAIN: 9790 9642 error_count = gaudi2_handle_psoc_drain(hdev, 9791 9643 le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); 9644 + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; 9792 9645 event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; 9793 9646 break; 9794 9647 ··· 9819 9668 break; 9820 9669 case GAUDI2_EVENT_PSOC_AXI_ERR_RSP: 9821 9670 error_count = GAUDI2_NA_EVENT_CAUSE; 9671 + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; 9822 9672 event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; 9823 9673 break; 9824 9674 case GAUDI2_EVENT_PSOC_PRSTN_FALL: ··· 9833 9681 break; 9834 9682 case GAUDI2_EVENT_PCIE_FATAL_ERR: 9835 9683 error_count = GAUDI2_NA_EVENT_CAUSE; 9684 + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; 9836 9685 event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; 9837 9686 break; 9838 9687 case GAUDI2_EVENT_TPC0_BMON_SPMU: ··· 9901 9748 case GAUDI2_EVENT_CPU_PKT_QUEUE_OUT_SYNC: 9902 9749 gaudi2_print_out_of_sync_info(hdev, event_type, &eq_entry->pkt_sync_err); 9903 9750 error_count = GAUDI2_NA_EVENT_CAUSE; 9751 + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; 9904 9752 event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; 9905 9753 break; 9906 9754 ··· 9943 9789 case GAUDI2_EVENT_CPU_PKT_SANITY_FAILED: 9944 9790 gaudi2_print_cpu_pkt_failure_info(hdev, event_type, &eq_entry->pkt_sync_err); 9945 9791 error_count = GAUDI2_NA_EVENT_CAUSE; 9792 + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; 9946 9793 event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; 9947 9794 break; 9948 9795
+17
drivers/accel/habanalabs/gaudi2/gaudi2P.h
··· 240 240 #define GAUDI2_SOB_INCREMENT_BY_ONE (FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1) | \ 241 241 FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1)) 242 242 243 + #define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0) 244 + 243 245 #define GAUDI2_NUM_OF_GLBL_ERR_CAUSE 8 244 246 245 247 enum gaudi2_reserved_sob_id { ··· 455 453 }; 456 454 457 455 /** 456 + * struct gaudi2_queues_test_info - Holds the address of a the messages used for testing the 457 + * device queues. 458 + * @dma_addr: the address used by the HW for accessing the message. 459 + * @kern_addr: The address used by the driver for accessing the message. 460 + */ 461 + struct gaudi2_queues_test_info { 462 + dma_addr_t dma_addr; 463 + void *kern_addr; 464 + }; 465 + 466 + /** 458 467 * struct gaudi2_device - ASIC specific manage structure. 459 468 * @cpucp_info_get: get information on device from CPU-CP 460 469 * @mapped_blocks: array that holds the base address and size of all blocks ··· 523 510 * @flush_db_fifo: flag to force flush DB FIFO after a write. 524 511 * @hbm_cfg: HBM subsystem settings 525 512 * @hw_queues_lock_mutex: used by simulator instead of hw_queues_lock. 513 + * @queues_test_info: information used by the driver when testing the HW queues. 526 514 */ 527 515 struct gaudi2_device { 528 516 int (*cpucp_info_get)(struct hl_device *hdev); ··· 551 537 u32 events_stat[GAUDI2_EVENT_SIZE]; 552 538 u32 events_stat_aggregate[GAUDI2_EVENT_SIZE]; 553 539 u32 num_of_valid_hw_events; 540 + 541 + /* Queue testing */ 542 + struct gaudi2_queues_test_info queues_test_info[GAUDI2_NUM_TESTED_QS]; 554 543 }; 555 544 556 545 /*
+1
drivers/accel/habanalabs/goya/goya.c
··· 473 473 474 474 prop->first_available_user_interrupt = USHRT_MAX; 475 475 prop->tpc_interrupt_id = USHRT_MAX; 476 + prop->eq_interrupt_id = GOYA_EVENT_QUEUE_MSIX_IDX; 476 477 477 478 for (i = 0 ; i < HL_MAX_DCORES ; i++) 478 479 prop->first_available_cq[i] = USHRT_MAX;
+3 -1
drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 2 2 * 3 - * Copyright 2020-2022 HabanaLabs, Ltd. 3 + * Copyright 2020-2023 HabanaLabs, Ltd. 4 4 * All Rights Reserved. 5 5 * 6 6 */ ··· 542 542 #define HBM_MC_SPI_THR_DIS_ENG_MASK BIT(2) 543 543 #define HBM_MC_SPI_IEEE1500_COMP_MASK BIT(3) 544 544 #define HBM_MC_SPI_IEEE1500_PAUSED_MASK BIT(4) 545 + 546 + #define ARC_FARM_OFFSET (mmARC_FARM_ARC1_AUX_BASE - mmARC_FARM_ARC0_AUX_BASE) 545 547 546 548 #include "nic0_qpc0_regs.h" 547 549 #include "nic0_qm0_regs.h"
+2 -1
include/uapi/drm/habanalabs_accel.h
··· 708 708 HL_SERVER_GAUDI_HLS1H = 2, 709 709 HL_SERVER_GAUDI_TYPE1 = 3, 710 710 HL_SERVER_GAUDI_TYPE2 = 4, 711 - HL_SERVER_GAUDI2_HLS2 = 5 711 + HL_SERVER_GAUDI2_HLS2 = 5, 712 + HL_SERVER_GAUDI2_TYPE1 = 7 712 713 }; 713 714 714 715 /*