Merge tag 'char-misc-5.15-rc1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc

+19

Documentation/ABI/testing/debugfs-driver-habanalabs

··· 215 215 "0" means device will be reset in case some CS has timed out, 216 216 otherwise it will not be reset. 217 217 218 + What: /sys/kernel/debug/habanalabs/hl<n>/state_dump 219 + Date: Oct 2021 220 + KernelVersion: 5.15 221 + Contact: ynudelman@habana.ai 222 + Description: Gets the state dump occurring on a CS timeout or failure. 223 + The state dump is used for debugging and is created whenever 224 + a problem occurs in a CS execution, before reset. 225 + Reading from the node returns the newest state dump available. 226 + Writing an integer X discards X state dumps, so that the 227 + next read returns the (X+1)-th newest state dump. 228 + 218 229 What: /sys/kernel/debug/habanalabs/hl<n>/stop_on_err 219 230 Date: Mar 2020 220 231 KernelVersion: 5.6 ··· 240 229 Description: Displays a list with information about the currently user 241 230 pointers (user virtual addresses) that are pinned and mapped 242 231 to DMA addresses 232 + 233 + What: /sys/kernel/debug/habanalabs/hl<n>/userptr_lookup 234 + Date: Aug 2021 235 + KernelVersion: 5.15 236 + Contact: ogabbay@kernel.org 237 + Description: Allows searching for specific user pointers (user virtual 238 + addresses) that are pinned and mapped to DMA addresses, and seeing 239 + their resolution to the specific DMA address. 243 240 244 241 What: /sys/kernel/debug/habanalabs/hl<n>/vm 245 242 Date: Jan 2019

+2 -1

drivers/misc/habanalabs/common/Makefile

··· 10 10 common/asid.o common/habanalabs_ioctl.o \ 11 11 common/command_buffer.o common/hw_queue.o common/irq.o \ 12 12 common/sysfs.o common/hwmon.o common/memory.o \ 13 - common/command_submission.o common/firmware_if.o 13 + common/command_submission.o common/firmware_if.o \ 14 + common/state_dump.o

+1 -3

drivers/misc/habanalabs/common/command_buffer.c

··· 314 314 315 315 spin_lock(&mgr->cb_lock); 316 316 rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_ATOMIC); 317 - if (rc < 0) 318 - rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_KERNEL); 319 317 spin_unlock(&mgr->cb_lock); 320 318 321 319 if (rc < 0) { ··· 550 552 551 553 vma->vm_private_data = cb; 552 554 553 - rc = hdev->asic_funcs->cb_mmap(hdev, vma, cb->kernel_address, 555 + rc = hdev->asic_funcs->mmap(hdev, vma, cb->kernel_address, 554 556 cb->bus_address, cb->size); 555 557 if (rc) { 556 558 spin_lock(&cb->lock);

+1009 -298

drivers/misc/habanalabs/common/command_submission.c

··· 38 38 kref); 39 39 struct hl_device *hdev = hw_sob->hdev; 40 40 41 + dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id); 42 + 41 43 hdev->asic_funcs->reset_sob(hdev, hw_sob); 44 + 45 + hw_sob->need_reset = false; 42 46 } 43 47 44 48 void hl_sob_reset_error(struct kref *ref) ··· 54 50 dev_crit(hdev->dev, 55 51 "SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n", 56 52 hw_sob->q_idx, hw_sob->sob_id); 53 + } 54 + 55 + void hw_sob_put(struct hl_hw_sob *hw_sob) 56 + { 57 + if (hw_sob) 58 + kref_put(&hw_sob->kref, hl_sob_reset); 59 + } 60 + 61 + static void hw_sob_put_err(struct hl_hw_sob *hw_sob) 62 + { 63 + if (hw_sob) 64 + kref_put(&hw_sob->kref, hl_sob_reset_error); 65 + } 66 + 67 + void hw_sob_get(struct hl_hw_sob *hw_sob) 68 + { 69 + if (hw_sob) 70 + kref_get(&hw_sob->kref); 57 71 } 58 72 59 73 /** ··· 106 84 return 0; 107 85 } 108 86 109 - static void sob_reset_work(struct work_struct *work) 110 - { 111 - struct hl_cs_compl *hl_cs_cmpl = 112 - container_of(work, struct hl_cs_compl, sob_reset_work); 113 - struct hl_device *hdev = hl_cs_cmpl->hdev; 114 - 115 - /* 116 - * A signal CS can get completion while the corresponding wait 117 - * for signal CS is on its way to the PQ. The wait for signal CS 118 - * will get stuck if the signal CS incremented the SOB to its 119 - * max value and there are no pending (submitted) waits on this 120 - * SOB. 121 - * We do the following to void this situation: 122 - * 1. The wait for signal CS must get a ref for the signal CS as 123 - * soon as possible in cs_ioctl_signal_wait() and put it 124 - * before being submitted to the PQ but after it incremented 125 - * the SOB refcnt in init_signal_wait_cs(). 126 - * 2. Signal/Wait for signal CS will decrement the SOB refcnt 127 - * here. 128 - * These two measures guarantee that the wait for signal CS will 129 - * reset the SOB upon completion rather than the signal CS and 130 - * hence the above scenario is avoided. 
131 - */ 132 - kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset); 133 - 134 - if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) 135 - hdev->asic_funcs->reset_sob_group(hdev, 136 - hl_cs_cmpl->sob_group); 137 - 138 - kfree(hl_cs_cmpl); 139 - } 140 - 141 87 static void hl_fence_release(struct kref *kref) 142 88 { 143 89 struct hl_fence *fence = 144 90 container_of(kref, struct hl_fence, refcount); 145 91 struct hl_cs_compl *hl_cs_cmpl = 146 92 container_of(fence, struct hl_cs_compl, base_fence); 147 - struct hl_device *hdev = hl_cs_cmpl->hdev; 148 93 149 - /* EBUSY means the CS was never submitted and hence we don't have 150 - * an attached hw_sob object that we should handle here 151 - */ 152 - if (fence->error == -EBUSY) 153 - goto free; 154 - 155 - if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) || 156 - (hl_cs_cmpl->type == CS_TYPE_WAIT) || 157 - (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)) { 158 - 159 - dev_dbg(hdev->dev, 160 - "CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n", 161 - hl_cs_cmpl->cs_seq, 162 - hl_cs_cmpl->type, 163 - hl_cs_cmpl->hw_sob->sob_id, 164 - hl_cs_cmpl->sob_val); 165 - 166 - queue_work(hdev->sob_reset_wq, &hl_cs_cmpl->sob_reset_work); 167 - 168 - return; 169 - } 170 - 171 - free: 172 94 kfree(hl_cs_cmpl); 173 95 } 174 96 175 97 void hl_fence_put(struct hl_fence *fence) 176 98 { 177 - if (fence) 178 - kref_put(&fence->refcount, hl_fence_release); 99 + if (IS_ERR_OR_NULL(fence)) 100 + return; 101 + kref_put(&fence->refcount, hl_fence_release); 102 + } 103 + 104 + void hl_fences_put(struct hl_fence **fence, int len) 105 + { 106 + int i; 107 + 108 + for (i = 0; i < len; i++, fence++) 109 + hl_fence_put(*fence); 179 110 } 180 111 181 112 void hl_fence_get(struct hl_fence *fence) ··· 448 473 spin_unlock(&hdev->cs_mirror_lock); 449 474 } 450 475 476 + /* 477 + * force_complete_multi_cs - complete all contexts that wait on multi-CS 478 + * 479 + * @hdev: pointer to habanalabs device structure 480 + */ 481 + static void 
force_complete_multi_cs(struct hl_device *hdev) 482 + { 483 + int i; 484 + 485 + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { 486 + struct multi_cs_completion *mcs_compl; 487 + 488 + mcs_compl = &hdev->multi_cs_completion[i]; 489 + 490 + spin_lock(&mcs_compl->lock); 491 + 492 + if (!mcs_compl->used) { 493 + spin_unlock(&mcs_compl->lock); 494 + continue; 495 + } 496 + 497 + /* when calling force complete no context should be waiting on 498 + * multi-cS. 499 + * We are calling the function as a protection for such case 500 + * to free any pending context and print error message 501 + */ 502 + dev_err(hdev->dev, 503 + "multi-CS completion context %d still waiting when calling force completion\n", 504 + i); 505 + complete_all(&mcs_compl->completion); 506 + spin_unlock(&mcs_compl->lock); 507 + } 508 + } 509 + 510 + /* 511 + * complete_multi_cs - complete all waiting entities on multi-CS 512 + * 513 + * @hdev: pointer to habanalabs device structure 514 + * @cs: CS structure 515 + * The function signals a waiting entity that has an overlapping stream masters 516 + * with the completed CS. 517 + * For example: 518 + * - a completed CS worked on stream master QID 4, multi CS completion 519 + * is actively waiting on stream master QIDs 3, 5. don't send signal as no 520 + * common stream master QID 521 + * - a completed CS worked on stream master QID 4, multi CS completion 522 + * is actively waiting on stream master QIDs 3, 4. 
send signal as stream 523 + * master QID 4 is common 524 + */ 525 + static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs) 526 + { 527 + struct hl_fence *fence = cs->fence; 528 + int i; 529 + 530 + /* in case of multi CS check for completion only for the first CS */ 531 + if (cs->staged_cs && !cs->staged_first) 532 + return; 533 + 534 + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { 535 + struct multi_cs_completion *mcs_compl; 536 + 537 + mcs_compl = &hdev->multi_cs_completion[i]; 538 + if (!mcs_compl->used) 539 + continue; 540 + 541 + spin_lock(&mcs_compl->lock); 542 + 543 + /* 544 + * complete if: 545 + * 1. still waiting for completion 546 + * 2. the completed CS has at least one overlapping stream 547 + * master with the stream masters in the completion 548 + */ 549 + if (mcs_compl->used && 550 + (fence->stream_master_qid_map & 551 + mcs_compl->stream_master_qid_map)) { 552 + /* extract the timestamp only of first completed CS */ 553 + if (!mcs_compl->timestamp) 554 + mcs_compl->timestamp = 555 + ktime_to_ns(fence->timestamp); 556 + complete_all(&mcs_compl->completion); 557 + } 558 + 559 + spin_unlock(&mcs_compl->lock); 560 + } 561 + } 562 + 563 + static inline void cs_release_sob_reset_handler(struct hl_device *hdev, 564 + struct hl_cs *cs, 565 + struct hl_cs_compl *hl_cs_cmpl) 566 + { 567 + /* Skip this handler if the cs wasn't submitted, to avoid putting 568 + * the hw_sob twice, since this case already handled at this point, 569 + * also skip if the hw_sob pointer wasn't set. 570 + */ 571 + if (!hl_cs_cmpl->hw_sob || !cs->submitted) 572 + return; 573 + 574 + spin_lock(&hl_cs_cmpl->lock); 575 + 576 + /* 577 + * we get refcount upon reservation of signals or signal/wait cs for the 578 + * hw_sob object, and need to put it when the first staged cs 579 + * (which cotains the encaps signals) or cs signal/wait is completed. 
580 + */ 581 + if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) || 582 + (hl_cs_cmpl->type == CS_TYPE_WAIT) || 583 + (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) || 584 + (!!hl_cs_cmpl->encaps_signals)) { 585 + dev_dbg(hdev->dev, 586 + "CS 0x%llx type %d finished, sob_id: %d, sob_val: %u\n", 587 + hl_cs_cmpl->cs_seq, 588 + hl_cs_cmpl->type, 589 + hl_cs_cmpl->hw_sob->sob_id, 590 + hl_cs_cmpl->sob_val); 591 + 592 + hw_sob_put(hl_cs_cmpl->hw_sob); 593 + 594 + if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) 595 + hdev->asic_funcs->reset_sob_group(hdev, 596 + hl_cs_cmpl->sob_group); 597 + } 598 + 599 + spin_unlock(&hl_cs_cmpl->lock); 600 + } 601 + 451 602 static void cs_do_release(struct kref *ref) 452 603 { 453 604 struct hl_cs *cs = container_of(ref, struct hl_cs, refcount); 454 605 struct hl_device *hdev = cs->ctx->hdev; 455 606 struct hl_cs_job *job, *tmp; 607 + struct hl_cs_compl *hl_cs_cmpl = 608 + container_of(cs->fence, struct hl_cs_compl, base_fence); 456 609 457 610 cs->completed = true; 458 611 ··· 596 493 complete_job(hdev, job); 597 494 598 495 if (!cs->submitted) { 599 - /* In case the wait for signal CS was submitted, the put occurs 600 - * in init_signal_wait_cs() or collective_wait_init_cs() 496 + /* 497 + * In case the wait for signal CS was submitted, the fence put 498 + * occurs in init_signal_wait_cs() or collective_wait_init_cs() 601 499 * right before hanging on the PQ. 602 500 */ 603 501 if (cs->type == CS_TYPE_WAIT || ··· 639 535 list_del(&cs->staged_cs_node); 640 536 spin_unlock(&hdev->cs_mirror_lock); 641 537 } 538 + 539 + /* decrement refcount to handle when first staged cs 540 + * with encaps signals is completed. 
541 + */ 542 + if (hl_cs_cmpl->encaps_signals) 543 + kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount, 544 + hl_encaps_handle_do_release); 642 545 } 546 + 547 + if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) 548 + && cs->encaps_signals) 549 + kref_put(&cs->encaps_sig_hdl->refcount, 550 + hl_encaps_handle_do_release); 643 551 644 552 out: 645 553 /* Must be called before hl_ctx_put because inside we use ctx to get ··· 682 566 if (cs->timestamp) 683 567 cs->fence->timestamp = ktime_get(); 684 568 complete_all(&cs->fence->completion); 569 + complete_multi_cs(hdev, cs); 570 + 571 + cs_release_sob_reset_handler(hdev, cs, hl_cs_cmpl); 572 + 685 573 hl_fence_put(cs->fence); 686 574 687 575 kfree(cs->jobs_in_queue_cnt); ··· 741 621 break; 742 622 } 743 623 624 + rc = hl_state_dump(hdev); 625 + if (rc) 626 + dev_err(hdev->dev, "Error during system state dump %d\n", rc); 627 + 744 628 cs_put(cs); 745 629 746 630 if (likely(!skip_reset_on_timeout)) { ··· 785 661 cs->completed = false; 786 662 cs->type = cs_type; 787 663 cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP); 664 + cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS); 788 665 cs->timeout_jiffies = timeout; 789 666 cs->skip_reset_on_timeout = 790 667 hdev->skip_reset_on_timeout || ··· 796 671 kref_init(&cs->refcount); 797 672 spin_lock_init(&cs->job_lock); 798 673 799 - cs_cmpl = kmalloc(sizeof(*cs_cmpl), GFP_ATOMIC); 674 + cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_ATOMIC); 800 675 if (!cs_cmpl) 801 - cs_cmpl = kmalloc(sizeof(*cs_cmpl), GFP_KERNEL); 676 + cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_KERNEL); 802 677 803 678 if (!cs_cmpl) { 804 679 atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); ··· 823 698 cs_cmpl->hdev = hdev; 824 699 cs_cmpl->type = cs->type; 825 700 spin_lock_init(&cs_cmpl->lock); 826 - INIT_WORK(&cs_cmpl->sob_reset_work, sob_reset_work); 827 701 cs->fence = &cs_cmpl->base_fence; 828 702 829 703 spin_lock(&ctx->cs_lock); ··· 915 791 cs_rollback(hdev, cs); 916 792 
cs_put(cs); 917 793 } 918 - } 919 794 920 - void hl_pending_cb_list_flush(struct hl_ctx *ctx) 921 - { 922 - struct hl_pending_cb *pending_cb, *tmp; 923 - 924 - list_for_each_entry_safe(pending_cb, tmp, 925 - &ctx->pending_cb_list, cb_node) { 926 - list_del(&pending_cb->cb_node); 927 - hl_cb_put(pending_cb->cb); 928 - kfree(pending_cb); 929 - } 795 + force_complete_multi_cs(hdev); 930 796 } 931 797 932 798 static void 933 799 wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt) 934 800 { 935 801 struct hl_user_pending_interrupt *pend; 802 + unsigned long flags; 936 803 937 - spin_lock(&interrupt->wait_list_lock); 804 + spin_lock_irqsave(&interrupt->wait_list_lock, flags); 938 805 list_for_each_entry(pend, &interrupt->wait_list_head, wait_list_node) { 939 806 pend->fence.error = -EIO; 940 807 complete_all(&pend->fence.completion); 941 808 } 942 - spin_unlock(&interrupt->wait_list_lock); 809 + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); 943 810 } 944 811 945 812 void hl_release_pending_user_interrupts(struct hl_device *hdev) ··· 1096 981 return CS_TYPE_WAIT; 1097 982 else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT) 1098 983 return CS_TYPE_COLLECTIVE_WAIT; 984 + else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY) 985 + return CS_RESERVE_SIGNALS; 986 + else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY) 987 + return CS_UNRESERVE_SIGNALS; 1099 988 else 1100 989 return CS_TYPE_DEFAULT; 1101 990 } ··· 1200 1081 } 1201 1082 1202 1083 static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs, 1203 - u64 sequence, u32 flags) 1084 + u64 sequence, u32 flags, 1085 + u32 encaps_signal_handle) 1204 1086 { 1205 1087 if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION)) 1206 1088 return 0; ··· 1213 1093 /* Staged CS sequence is the first CS sequence */ 1214 1094 INIT_LIST_HEAD(&cs->staged_cs_node); 1215 1095 cs->staged_sequence = cs->sequence; 1096 + 1097 + if (cs->encaps_signals) 1098 + cs->encaps_sig_hdl_id = 
encaps_signal_handle; 1216 1099 } else { 1217 1100 /* User sequence will be validated in 'hl_hw_queue_schedule_cs' 1218 1101 * under the cs_mirror_lock ··· 1231 1108 return 0; 1232 1109 } 1233 1110 1111 + static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid) 1112 + { 1113 + int i; 1114 + 1115 + for (i = 0; i < hdev->stream_master_qid_arr_size; i++) 1116 + if (qid == hdev->stream_master_qid_arr[i]) 1117 + return BIT(i); 1118 + 1119 + return 0; 1120 + } 1121 + 1234 1122 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, 1235 1123 u32 num_chunks, u64 *cs_seq, u32 flags, 1236 - u32 timeout) 1124 + u32 encaps_signals_handle, u32 timeout) 1237 1125 { 1238 1126 bool staged_mid, int_queues_only = true; 1239 1127 struct hl_device *hdev = hpriv->hdev; ··· 1255 1121 struct hl_cs *cs; 1256 1122 struct hl_cb *cb; 1257 1123 u64 user_sequence; 1124 + u8 stream_master_qid_map = 0; 1258 1125 int rc, i; 1259 1126 1260 1127 cntr = &hdev->aggregated_cs_counters; ··· 1283 1148 1284 1149 hl_debugfs_add_cs(cs); 1285 1150 1286 - rc = cs_staged_submission(hdev, cs, user_sequence, flags); 1151 + rc = cs_staged_submission(hdev, cs, user_sequence, flags, 1152 + encaps_signals_handle); 1287 1153 if (rc) 1288 1154 goto free_cs_object; 1289 1155 ··· 1315 1179 cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle; 1316 1180 } 1317 1181 1318 - if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW) 1182 + if (queue_type == QUEUE_TYPE_EXT || 1183 + queue_type == QUEUE_TYPE_HW) { 1319 1184 int_queues_only = false; 1185 + 1186 + /* 1187 + * store which stream are being used for external/HW 1188 + * queues of this CS 1189 + */ 1190 + if (hdev->supports_wait_for_multi_cs) 1191 + stream_master_qid_map |= 1192 + get_stream_master_qid_mask(hdev, 1193 + chunk->queue_index); 1194 + } 1320 1195 1321 1196 job = hl_cs_allocate_job(hdev, queue_type, 1322 1197 is_kernel_allocated_cb); ··· 1389 1242 goto free_cs_object; 1390 1243 } 1391 1244 1245 + /* 1246 + * store 
the (external/HW queues) streams used by the CS in the 1247 + * fence object for multi-CS completion 1248 + */ 1249 + if (hdev->supports_wait_for_multi_cs) 1250 + cs->fence->stream_master_qid_map = stream_master_qid_map; 1251 + 1392 1252 rc = hl_hw_queue_schedule_cs(cs); 1393 1253 if (rc) { 1394 1254 if (rc != -EAGAIN) ··· 1421 1267 free_cs_chunk_array: 1422 1268 kfree(cs_chunk_array); 1423 1269 out: 1424 - return rc; 1425 - } 1426 - 1427 - static int pending_cb_create_job(struct hl_device *hdev, struct hl_ctx *ctx, 1428 - struct hl_cs *cs, struct hl_cb *cb, u32 size, u32 hw_queue_id) 1429 - { 1430 - struct hw_queue_properties *hw_queue_prop; 1431 - struct hl_cs_counters_atomic *cntr; 1432 - struct hl_cs_job *job; 1433 - 1434 - hw_queue_prop = &hdev->asic_prop.hw_queues_props[hw_queue_id]; 1435 - cntr = &hdev->aggregated_cs_counters; 1436 - 1437 - job = hl_cs_allocate_job(hdev, hw_queue_prop->type, true); 1438 - if (!job) { 1439 - atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); 1440 - atomic64_inc(&cntr->out_of_mem_drop_cnt); 1441 - dev_err(hdev->dev, "Failed to allocate a new job\n"); 1442 - return -ENOMEM; 1443 - } 1444 - 1445 - job->id = 0; 1446 - job->cs = cs; 1447 - job->user_cb = cb; 1448 - atomic_inc(&job->user_cb->cs_cnt); 1449 - job->user_cb_size = size; 1450 - job->hw_queue_id = hw_queue_id; 1451 - job->patched_cb = job->user_cb; 1452 - job->job_cb_size = job->user_cb_size; 1453 - 1454 - /* increment refcount as for external queues we get completion */ 1455 - cs_get(cs); 1456 - 1457 - cs->jobs_in_queue_cnt[job->hw_queue_id]++; 1458 - 1459 - list_add_tail(&job->cs_node, &cs->job_list); 1460 - 1461 - hl_debugfs_add_job(hdev, job); 1462 - 1463 - return 0; 1464 - } 1465 - 1466 - static int hl_submit_pending_cb(struct hl_fpriv *hpriv) 1467 - { 1468 - struct hl_device *hdev = hpriv->hdev; 1469 - struct hl_ctx *ctx = hpriv->ctx; 1470 - struct hl_pending_cb *pending_cb, *tmp; 1471 - struct list_head local_cb_list; 1472 - struct hl_cs *cs; 1473 - struct hl_cb 
*cb; 1474 - u32 hw_queue_id; 1475 - u32 cb_size; 1476 - int process_list, rc = 0; 1477 - 1478 - if (list_empty(&ctx->pending_cb_list)) 1479 - return 0; 1480 - 1481 - process_list = atomic_cmpxchg(&ctx->thread_pending_cb_token, 1, 0); 1482 - 1483 - /* Only a single thread is allowed to process the list */ 1484 - if (!process_list) 1485 - return 0; 1486 - 1487 - if (list_empty(&ctx->pending_cb_list)) 1488 - goto free_pending_cb_token; 1489 - 1490 - /* move all list elements to a local list */ 1491 - INIT_LIST_HEAD(&local_cb_list); 1492 - spin_lock(&ctx->pending_cb_lock); 1493 - list_for_each_entry_safe(pending_cb, tmp, &ctx->pending_cb_list, 1494 - cb_node) 1495 - list_move_tail(&pending_cb->cb_node, &local_cb_list); 1496 - spin_unlock(&ctx->pending_cb_lock); 1497 - 1498 - rc = allocate_cs(hdev, ctx, CS_TYPE_DEFAULT, ULLONG_MAX, &cs, 0, 1499 - hdev->timeout_jiffies); 1500 - if (rc) 1501 - goto add_list_elements; 1502 - 1503 - hl_debugfs_add_cs(cs); 1504 - 1505 - /* Iterate through pending cb list, create jobs and add to CS */ 1506 - list_for_each_entry(pending_cb, &local_cb_list, cb_node) { 1507 - cb = pending_cb->cb; 1508 - cb_size = pending_cb->cb_size; 1509 - hw_queue_id = pending_cb->hw_queue_id; 1510 - 1511 - rc = pending_cb_create_job(hdev, ctx, cs, cb, cb_size, 1512 - hw_queue_id); 1513 - if (rc) 1514 - goto free_cs_object; 1515 - } 1516 - 1517 - rc = hl_hw_queue_schedule_cs(cs); 1518 - if (rc) { 1519 - if (rc != -EAGAIN) 1520 - dev_err(hdev->dev, 1521 - "Failed to submit CS %d.%llu (%d)\n", 1522 - ctx->asid, cs->sequence, rc); 1523 - goto free_cs_object; 1524 - } 1525 - 1526 - /* pending cb was scheduled successfully */ 1527 - list_for_each_entry_safe(pending_cb, tmp, &local_cb_list, cb_node) { 1528 - list_del(&pending_cb->cb_node); 1529 - kfree(pending_cb); 1530 - } 1531 - 1532 - cs_put(cs); 1533 - 1534 - goto free_pending_cb_token; 1535 - 1536 - free_cs_object: 1537 - cs_rollback(hdev, cs); 1538 - cs_put(cs); 1539 - add_list_elements: 1540 - 
spin_lock(&ctx->pending_cb_lock); 1541 - list_for_each_entry_safe_reverse(pending_cb, tmp, &local_cb_list, 1542 - cb_node) 1543 - list_move(&pending_cb->cb_node, &ctx->pending_cb_list); 1544 - spin_unlock(&ctx->pending_cb_lock); 1545 - free_pending_cb_token: 1546 - atomic_set(&ctx->thread_pending_cb_token, 1); 1547 - 1548 1270 return rc; 1549 1271 } 1550 1272 ··· 1473 1443 rc = 0; 1474 1444 } else { 1475 1445 rc = cs_ioctl_default(hpriv, chunks, num_chunks, 1476 - cs_seq, 0, hdev->timeout_jiffies); 1446 + cs_seq, 0, 0, hdev->timeout_jiffies); 1477 1447 } 1478 1448 1479 1449 mutex_unlock(&hpriv->restore_phase_mutex); ··· 1531 1501 * hl_cs_signal_sob_wraparound_handler: handle SOB value wrapaound case. 1532 1502 * if the SOB value reaches the max value move to the other SOB reserved 1533 1503 * to the queue. 1504 + * @hdev: pointer to device structure 1505 + * @q_idx: stream queue index 1506 + * @hw_sob: the H/W SOB used in this signal CS. 1507 + * @count: signals count 1508 + * @encaps_sig: tells whether it's reservation for encaps signals or not. 1509 + * 1534 1510 * Note that this function must be called while hw_queues_lock is taken. 1535 1511 */ 1536 1512 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx, 1537 - struct hl_hw_sob **hw_sob, u32 count) 1513 + struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig) 1514 + 1538 1515 { 1539 1516 struct hl_sync_stream_properties *prop; 1540 1517 struct hl_hw_sob *sob = *hw_sob, *other_sob; ··· 1549 1512 1550 1513 prop = &hdev->kernel_queues[q_idx].sync_stream_prop; 1551 1514 1552 - kref_get(&sob->kref); 1515 + hw_sob_get(sob); 1553 1516 1554 1517 /* check for wraparound */ 1555 1518 if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) { ··· 1559 1522 * just incremented the refcount right before calling this 1560 1523 * function. 
1561 1524 */ 1562 - kref_put(&sob->kref, hl_sob_reset_error); 1525 + hw_sob_put_err(sob); 1563 1526 1564 1527 /* 1565 1528 * check the other sob value, if it still in use then fail ··· 1574 1537 return -EINVAL; 1575 1538 } 1576 1539 1577 - prop->next_sob_val = 1; 1540 + /* 1541 + * next_sob_val always points to the next available signal 1542 + * in the sob, so in encaps signals it will be the next one 1543 + * after reserving the required amount. 1544 + */ 1545 + if (encaps_sig) 1546 + prop->next_sob_val = count + 1; 1547 + else 1548 + prop->next_sob_val = count; 1578 1549 1579 1550 /* only two SOBs are currently in use */ 1580 1551 prop->curr_sob_offset = other_sob_offset; 1581 1552 *hw_sob = other_sob; 1553 + 1554 + /* 1555 + * check if other_sob needs reset, then do it before using it 1556 + * for the reservation or the next signal cs. 1557 + * we do it here, and for both encaps and regular signal cs 1558 + * cases in order to avoid possible races of two kref_put 1559 + * of the sob which can occur at the same time if we move the 1560 + * sob reset(kref_put) to cs_do_release function. 1561 + * in addition, if we have combination of cs signal and 1562 + * encaps, and at the point we need to reset the sob there was 1563 + * no more reservations and only signal cs keep coming, 1564 + * in such case we need signal_cs to put the refcount and 1565 + * reset the sob. 
1566 + */ 1567 + if (other_sob->need_reset) 1568 + hw_sob_put(other_sob); 1569 + 1570 + if (encaps_sig) { 1571 + /* set reset indication for the sob */ 1572 + sob->need_reset = true; 1573 + hw_sob_get(other_sob); 1574 + } 1582 1575 1583 1576 dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n", 1584 1577 prop->curr_sob_offset, q_idx); ··· 1620 1553 } 1621 1554 1622 1555 static int cs_ioctl_extract_signal_seq(struct hl_device *hdev, 1623 - struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx) 1556 + struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx, 1557 + bool encaps_signals) 1624 1558 { 1625 1559 u64 *signal_seq_arr = NULL; 1626 1560 u32 size_to_copy, signal_seq_arr_len; 1627 1561 int rc = 0; 1562 + 1563 + if (encaps_signals) { 1564 + *signal_seq = chunk->encaps_signal_seq; 1565 + return 0; 1566 + } 1628 1567 1629 1568 signal_seq_arr_len = chunk->num_signal_seq_arr; 1630 1569 ··· 1656 1583 return -ENOMEM; 1657 1584 } 1658 1585 1659 - size_to_copy = chunk->num_signal_seq_arr * sizeof(*signal_seq_arr); 1586 + size_to_copy = signal_seq_arr_len * sizeof(*signal_seq_arr); 1660 1587 if (copy_from_user(signal_seq_arr, 1661 1588 u64_to_user_ptr(chunk->signal_seq_arr), 1662 1589 size_to_copy)) { ··· 1678 1605 } 1679 1606 1680 1607 static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev, 1681 - struct hl_ctx *ctx, struct hl_cs *cs, enum hl_queue_type q_type, 1682 - u32 q_idx) 1608 + struct hl_ctx *ctx, struct hl_cs *cs, 1609 + enum hl_queue_type q_type, u32 q_idx, u32 encaps_signal_offset) 1683 1610 { 1684 1611 struct hl_cs_counters_atomic *cntr; 1685 1612 struct hl_cs_job *job; ··· 1717 1644 job->user_cb_size = cb_size; 1718 1645 job->hw_queue_id = q_idx; 1719 1646 1647 + if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) 1648 + && cs->encaps_signals) 1649 + job->encaps_sig_wait_offset = encaps_signal_offset; 1720 1650 /* 1721 1651 * No need in parsing, user CB is the patched CB. 
1722 1652 * We call hl_cb_destroy() out of two reasons - we don't need the CB in ··· 1742 1666 return 0; 1743 1667 } 1744 1668 1669 + static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv, 1670 + u32 q_idx, u32 count, 1671 + u32 *handle_id, u32 *sob_addr, 1672 + u32 *signals_count) 1673 + { 1674 + struct hw_queue_properties *hw_queue_prop; 1675 + struct hl_sync_stream_properties *prop; 1676 + struct hl_device *hdev = hpriv->hdev; 1677 + struct hl_cs_encaps_sig_handle *handle; 1678 + struct hl_encaps_signals_mgr *mgr; 1679 + struct hl_hw_sob *hw_sob; 1680 + int hdl_id; 1681 + int rc = 0; 1682 + 1683 + if (count >= HL_MAX_SOB_VAL) { 1684 + dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n", 1685 + count); 1686 + rc = -EINVAL; 1687 + goto out; 1688 + } 1689 + 1690 + if (q_idx >= hdev->asic_prop.max_queues) { 1691 + dev_err(hdev->dev, "Queue index %d is invalid\n", 1692 + q_idx); 1693 + rc = -EINVAL; 1694 + goto out; 1695 + } 1696 + 1697 + hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx]; 1698 + 1699 + if (!hw_queue_prop->supports_sync_stream) { 1700 + dev_err(hdev->dev, 1701 + "Queue index %d does not support sync stream operations\n", 1702 + q_idx); 1703 + rc = -EINVAL; 1704 + goto out; 1705 + } 1706 + 1707 + prop = &hdev->kernel_queues[q_idx].sync_stream_prop; 1708 + 1709 + handle = kzalloc(sizeof(*handle), GFP_KERNEL); 1710 + if (!handle) { 1711 + rc = -ENOMEM; 1712 + goto out; 1713 + } 1714 + 1715 + handle->count = count; 1716 + mgr = &hpriv->ctx->sig_mgr; 1717 + 1718 + spin_lock(&mgr->lock); 1719 + hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC); 1720 + spin_unlock(&mgr->lock); 1721 + 1722 + if (hdl_id < 0) { 1723 + dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n"); 1724 + rc = -EINVAL; 1725 + goto out; 1726 + } 1727 + 1728 + handle->id = hdl_id; 1729 + handle->q_idx = q_idx; 1730 + handle->hdev = hdev; 1731 + kref_init(&handle->refcount); 1732 + 1733 + hdev->asic_funcs->hw_queues_lock(hdev); 1734 
+ 1735 + hw_sob = &prop->hw_sob[prop->curr_sob_offset]; 1736 + 1737 + /* 1738 + * Increment the SOB value by count by user request 1739 + * to reserve those signals 1740 + * check if the signals amount to reserve is not exceeding the max sob 1741 + * value, if yes then switch sob. 1742 + */ 1743 + rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count, 1744 + true); 1745 + if (rc) { 1746 + dev_err(hdev->dev, "Failed to switch SOB\n"); 1747 + hdev->asic_funcs->hw_queues_unlock(hdev); 1748 + rc = -EINVAL; 1749 + goto remove_idr; 1750 + } 1751 + /* set the hw_sob to the handle after calling the sob wraparound handler 1752 + * since sob could have changed. 1753 + */ 1754 + handle->hw_sob = hw_sob; 1755 + 1756 + /* store the current sob value for unreserve validity check, and 1757 + * signal offset support 1758 + */ 1759 + handle->pre_sob_val = prop->next_sob_val - handle->count; 1760 + 1761 + *signals_count = prop->next_sob_val; 1762 + hdev->asic_funcs->hw_queues_unlock(hdev); 1763 + 1764 + *sob_addr = handle->hw_sob->sob_addr; 1765 + *handle_id = hdl_id; 1766 + 1767 + dev_dbg(hdev->dev, 1768 + "Signals reserved, sob_id: %d, sob addr: 0x%x, last sob_val: %u, q_idx: %d, hdl_id: %d\n", 1769 + hw_sob->sob_id, handle->hw_sob->sob_addr, 1770 + prop->next_sob_val - 1, q_idx, hdl_id); 1771 + goto out; 1772 + 1773 + remove_idr: 1774 + spin_lock(&mgr->lock); 1775 + idr_remove(&mgr->handles, hdl_id); 1776 + spin_unlock(&mgr->lock); 1777 + 1778 + kfree(handle); 1779 + out: 1780 + return rc; 1781 + } 1782 + 1783 + static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id) 1784 + { 1785 + struct hl_cs_encaps_sig_handle *encaps_sig_hdl; 1786 + struct hl_sync_stream_properties *prop; 1787 + struct hl_device *hdev = hpriv->hdev; 1788 + struct hl_encaps_signals_mgr *mgr; 1789 + struct hl_hw_sob *hw_sob; 1790 + u32 q_idx, sob_addr; 1791 + int rc = 0; 1792 + 1793 + mgr = &hpriv->ctx->sig_mgr; 1794 + 1795 + spin_lock(&mgr->lock); 1796 + encaps_sig_hdl = 
idr_find(&mgr->handles, handle_id); 1797 + if (encaps_sig_hdl) { 1798 + dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n", 1799 + handle_id, encaps_sig_hdl->hw_sob->sob_addr, 1800 + encaps_sig_hdl->count); 1801 + 1802 + hdev->asic_funcs->hw_queues_lock(hdev); 1803 + 1804 + q_idx = encaps_sig_hdl->q_idx; 1805 + prop = &hdev->kernel_queues[q_idx].sync_stream_prop; 1806 + hw_sob = &prop->hw_sob[prop->curr_sob_offset]; 1807 + sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id); 1808 + 1809 + /* Check if sob_val got out of sync due to other 1810 + * signal submission requests which were handled 1811 + * between the reserve-unreserve calls or SOB switch 1812 + * upon reaching SOB max value. 1813 + */ 1814 + if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count 1815 + != prop->next_sob_val || 1816 + sob_addr != encaps_sig_hdl->hw_sob->sob_addr) { 1817 + dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n", 1818 + encaps_sig_hdl->pre_sob_val, 1819 + (prop->next_sob_val - encaps_sig_hdl->count)); 1820 + 1821 + hdev->asic_funcs->hw_queues_unlock(hdev); 1822 + rc = -EINVAL; 1823 + goto out; 1824 + } 1825 + 1826 + /* 1827 + * Decrement the SOB value by count by user request 1828 + * to unreserve those signals 1829 + */ 1830 + prop->next_sob_val -= encaps_sig_hdl->count; 1831 + 1832 + hdev->asic_funcs->hw_queues_unlock(hdev); 1833 + 1834 + hw_sob_put(hw_sob); 1835 + 1836 + /* Release the id and free allocated memory of the handle */ 1837 + idr_remove(&mgr->handles, handle_id); 1838 + kfree(encaps_sig_hdl); 1839 + } else { 1840 + rc = -EINVAL; 1841 + dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n"); 1842 + } 1843 + out: 1844 + spin_unlock(&mgr->lock); 1845 + 1846 + return rc; 1847 + } 1848 + 1745 1849 static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, 1746 1850 void __user *chunks, u32 num_chunks, 1747 1851 u64 *cs_seq, u32 flags, u32 
timeout) 1748 1852 { 1853 + struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL; 1854 + bool handle_found = false, is_wait_cs = false, 1855 + wait_cs_submitted = false, 1856 + cs_encaps_signals = false; 1749 1857 struct hl_cs_chunk *cs_chunk_array, *chunk; 1858 + bool staged_cs_with_encaps_signals = false; 1750 1859 struct hw_queue_properties *hw_queue_prop; 1751 1860 struct hl_device *hdev = hpriv->hdev; 1752 1861 struct hl_cs_compl *sig_waitcs_cmpl; ··· 1991 1730 collective_engine_id = chunk->collective_engine_id; 1992 1731 } 1993 1732 1994 - if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT) { 1995 - rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq, ctx); 1733 + is_wait_cs = !!(cs_type == CS_TYPE_WAIT || 1734 + cs_type == CS_TYPE_COLLECTIVE_WAIT); 1735 + 1736 + cs_encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS); 1737 + 1738 + if (is_wait_cs) { 1739 + rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq, 1740 + ctx, cs_encaps_signals); 1996 1741 if (rc) 1997 1742 goto free_cs_chunk_array; 1743 + 1744 + if (cs_encaps_signals) { 1745 + /* check if cs sequence has encapsulated 1746 + * signals handle 1747 + */ 1748 + struct idr *idp; 1749 + u32 id; 1750 + 1751 + spin_lock(&ctx->sig_mgr.lock); 1752 + idp = &ctx->sig_mgr.handles; 1753 + idr_for_each_entry(idp, encaps_sig_hdl, id) { 1754 + if (encaps_sig_hdl->cs_seq == signal_seq) { 1755 + handle_found = true; 1756 + /* get refcount to protect removing 1757 + * this handle from idr, needed when 1758 + * multiple wait cs are used with offset 1759 + * to wait on reserved encaps signals. 
1760 + */ 1761 + kref_get(&encaps_sig_hdl->refcount); 1762 + break; 1763 + } 1764 + } 1765 + spin_unlock(&ctx->sig_mgr.lock); 1766 + 1767 + if (!handle_found) { 1768 + dev_err(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n", 1769 + signal_seq); 1770 + rc = -EINVAL; 1771 + goto free_cs_chunk_array; 1772 + } 1773 + 1774 + /* validate also the signal offset value */ 1775 + if (chunk->encaps_signal_offset > 1776 + encaps_sig_hdl->count) { 1777 + dev_err(hdev->dev, "offset(%u) value exceed max reserved signals count(%u)!\n", 1778 + chunk->encaps_signal_offset, 1779 + encaps_sig_hdl->count); 1780 + rc = -EINVAL; 1781 + goto free_cs_chunk_array; 1782 + } 1783 + } 1998 1784 1999 1785 sig_fence = hl_ctx_get_fence(ctx, signal_seq); 2000 1786 if (IS_ERR(sig_fence)) { ··· 2063 1755 sig_waitcs_cmpl = 2064 1756 container_of(sig_fence, struct hl_cs_compl, base_fence); 2065 1757 2066 - if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL) { 1758 + staged_cs_with_encaps_signals = !! 1759 + (sig_waitcs_cmpl->type == CS_TYPE_DEFAULT && 1760 + (flags & HL_CS_FLAGS_ENCAP_SIGNALS)); 1761 + 1762 + if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL && 1763 + !staged_cs_with_encaps_signals) { 2067 1764 atomic64_inc(&ctx->cs_counters.validation_drop_cnt); 2068 1765 atomic64_inc(&cntr->validation_drop_cnt); 2069 1766 dev_err(hdev->dev, 2070 - "CS seq 0x%llx is not of a signal CS\n", 1767 + "CS seq 0x%llx is not of a signal/encaps-signal CS\n", 2071 1768 signal_seq); 2072 1769 hl_fence_put(sig_fence); 2073 1770 rc = -EINVAL; ··· 2089 1776 2090 1777 rc = allocate_cs(hdev, ctx, cs_type, ULLONG_MAX, &cs, flags, timeout); 2091 1778 if (rc) { 2092 - if (cs_type == CS_TYPE_WAIT || 2093 - cs_type == CS_TYPE_COLLECTIVE_WAIT) 1779 + if (is_wait_cs) 2094 1780 hl_fence_put(sig_fence); 1781 + 2095 1782 goto free_cs_chunk_array; 2096 1783 } 2097 1784 2098 1785 /* 2099 1786 * Save the signal CS fence for later initialization right before 2100 1787 * hanging the wait CS on the queue. 
1788 + * for encaps signals case, we save the cs sequence and handle pointer 1789 + * for later initialization. 2101 1790 */ 2102 - if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT) 1791 + if (is_wait_cs) { 2103 1792 cs->signal_fence = sig_fence; 1793 + /* store the handle pointer, so we don't have to 1794 + * look for it again, later on the flow 1795 + * when we need to set SOB info in hw_queue. 1796 + */ 1797 + if (cs->encaps_signals) 1798 + cs->encaps_sig_hdl = encaps_sig_hdl; 1799 + } 2104 1800 2105 1801 hl_debugfs_add_cs(cs); 2106 1802 ··· 2117 1795 2118 1796 if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL) 2119 1797 rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type, 2120 - q_idx); 1798 + q_idx, chunk->encaps_signal_offset); 2121 1799 else if (cs_type == CS_TYPE_COLLECTIVE_WAIT) 2122 1800 rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx, 2123 - cs, q_idx, collective_engine_id); 1801 + cs, q_idx, collective_engine_id, 1802 + chunk->encaps_signal_offset); 2124 1803 else { 2125 1804 atomic64_inc(&ctx->cs_counters.validation_drop_cnt); 2126 1805 atomic64_inc(&cntr->validation_drop_cnt); ··· 2133 1810 2134 1811 rc = hl_hw_queue_schedule_cs(cs); 2135 1812 if (rc) { 2136 - if (rc != -EAGAIN) 1813 + /* In case wait cs failed here, it means the signal cs 1814 + * already completed. we want to free all it's related objects 1815 + * but we don't want to fail the ioctl. 
1816 + */ 1817 + if (is_wait_cs) 1818 + rc = 0; 1819 + else if (rc != -EAGAIN) 2137 1820 dev_err(hdev->dev, 2138 1821 "Failed to submit CS %d.%llu to H/W queues, error %d\n", 2139 1822 ctx->asid, cs->sequence, rc); ··· 2147 1818 } 2148 1819 2149 1820 rc = HL_CS_STATUS_SUCCESS; 1821 + if (is_wait_cs) 1822 + wait_cs_submitted = true; 2150 1823 goto put_cs; 2151 1824 2152 1825 free_cs_object: ··· 2159 1828 /* We finished with the CS in this function, so put the ref */ 2160 1829 cs_put(cs); 2161 1830 free_cs_chunk_array: 1831 + if (!wait_cs_submitted && cs_encaps_signals && handle_found && 1832 + is_wait_cs) 1833 + kref_put(&encaps_sig_hdl->refcount, 1834 + hl_encaps_handle_do_release); 2162 1835 kfree(cs_chunk_array); 2163 1836 out: 2164 1837 return rc; ··· 2171 1836 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) 2172 1837 { 2173 1838 union hl_cs_args *args = data; 2174 - enum hl_cs_type cs_type; 1839 + enum hl_cs_type cs_type = 0; 2175 1840 u64 cs_seq = ULONG_MAX; 2176 1841 void __user *chunks; 2177 - u32 num_chunks, flags, timeout; 1842 + u32 num_chunks, flags, timeout, 1843 + signals_count = 0, sob_addr = 0, handle_id = 0; 2178 1844 int rc; 2179 1845 2180 1846 rc = hl_cs_sanity_checks(hpriv, args); ··· 2183 1847 goto out; 2184 1848 2185 1849 rc = hl_cs_ctx_switch(hpriv, args, &cs_seq); 2186 - if (rc) 2187 - goto out; 2188 - 2189 - rc = hl_submit_pending_cb(hpriv); 2190 1850 if (rc) 2191 1851 goto out; 2192 1852 ··· 2208 1876 rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks, 2209 1877 &cs_seq, args->in.cs_flags, timeout); 2210 1878 break; 1879 + case CS_RESERVE_SIGNALS: 1880 + rc = cs_ioctl_reserve_signals(hpriv, 1881 + args->in.encaps_signals_q_idx, 1882 + args->in.encaps_signals_count, 1883 + &handle_id, &sob_addr, &signals_count); 1884 + break; 1885 + case CS_UNRESERVE_SIGNALS: 1886 + rc = cs_ioctl_unreserve_signals(hpriv, 1887 + args->in.encaps_sig_handle_id); 1888 + break; 2211 1889 default: 2212 1890 rc = cs_ioctl_default(hpriv, chunks, 
num_chunks, &cs_seq, 2213 - args->in.cs_flags, timeout); 1891 + args->in.cs_flags, 1892 + args->in.encaps_sig_handle_id, 1893 + timeout); 2214 1894 break; 2215 1895 } 2216 - 2217 1896 out: 2218 1897 if (rc != -EAGAIN) { 2219 1898 memset(args, 0, sizeof(*args)); 1899 + 1900 + if (cs_type == CS_RESERVE_SIGNALS) { 1901 + args->out.handle_id = handle_id; 1902 + args->out.sob_base_addr_offset = sob_addr; 1903 + args->out.count = signals_count; 1904 + } else { 1905 + args->out.seq = cs_seq; 1906 + } 2220 1907 args->out.status = rc; 2221 - args->out.seq = cs_seq; 2222 1908 } 1909 + 1910 + return rc; 1911 + } 1912 + 1913 + static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence, 1914 + enum hl_cs_wait_status *status, u64 timeout_us, 1915 + s64 *timestamp) 1916 + { 1917 + struct hl_device *hdev = ctx->hdev; 1918 + long completion_rc; 1919 + int rc = 0; 1920 + 1921 + if (IS_ERR(fence)) { 1922 + rc = PTR_ERR(fence); 1923 + if (rc == -EINVAL) 1924 + dev_notice_ratelimited(hdev->dev, 1925 + "Can't wait on CS %llu because current CS is at seq %llu\n", 1926 + seq, ctx->cs_sequence); 1927 + return rc; 1928 + } 1929 + 1930 + if (!fence) { 1931 + dev_dbg(hdev->dev, 1932 + "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n", 1933 + seq, ctx->cs_sequence); 1934 + 1935 + *status = CS_WAIT_STATUS_GONE; 1936 + return 0; 1937 + } 1938 + 1939 + if (!timeout_us) { 1940 + completion_rc = completion_done(&fence->completion); 1941 + } else { 1942 + unsigned long timeout; 1943 + 1944 + timeout = (timeout_us == MAX_SCHEDULE_TIMEOUT) ? 
1945 + timeout_us : usecs_to_jiffies(timeout_us); 1946 + completion_rc = 1947 + wait_for_completion_interruptible_timeout( 1948 + &fence->completion, timeout); 1949 + } 1950 + 1951 + if (completion_rc > 0) { 1952 + *status = CS_WAIT_STATUS_COMPLETED; 1953 + if (timestamp) 1954 + *timestamp = ktime_to_ns(fence->timestamp); 1955 + } else { 1956 + *status = CS_WAIT_STATUS_BUSY; 1957 + } 1958 + 1959 + if (fence->error == -ETIMEDOUT) 1960 + rc = -ETIMEDOUT; 1961 + else if (fence->error == -EIO) 1962 + rc = -EIO; 1963 + 1964 + return rc; 1965 + } 1966 + 1967 + /* 1968 + * hl_cs_poll_fences - iterate CS fences to check for CS completion 1969 + * 1970 + * @mcs_data: multi-CS internal data 1971 + * 1972 + * @return 0 on success, otherwise non 0 error code 1973 + * 1974 + * The function iterates on all CS sequence in the list and set bit in 1975 + * completion_bitmap for each completed CS. 1976 + * while iterating, the function can extracts the stream map to be later 1977 + * used by the waiting function. 
1978 + * this function shall be called after taking context ref 1979 + */ 1980 + static int hl_cs_poll_fences(struct multi_cs_data *mcs_data) 1981 + { 1982 + struct hl_fence **fence_ptr = mcs_data->fence_arr; 1983 + struct hl_device *hdev = mcs_data->ctx->hdev; 1984 + int i, rc, arr_len = mcs_data->arr_len; 1985 + u64 *seq_arr = mcs_data->seq_arr; 1986 + ktime_t max_ktime, first_cs_time; 1987 + enum hl_cs_wait_status status; 1988 + 1989 + memset(fence_ptr, 0, arr_len * sizeof(*fence_ptr)); 1990 + 1991 + /* get all fences under the same lock */ 1992 + rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len); 1993 + if (rc) 1994 + return rc; 1995 + 1996 + /* 1997 + * set to maximum time to verify timestamp is valid: if at the end 1998 + * this value is maintained- no timestamp was updated 1999 + */ 2000 + max_ktime = ktime_set(KTIME_SEC_MAX, 0); 2001 + first_cs_time = max_ktime; 2002 + 2003 + for (i = 0; i < arr_len; i++, fence_ptr++) { 2004 + struct hl_fence *fence = *fence_ptr; 2005 + 2006 + /* 2007 + * function won't sleep as it is called with timeout 0 (i.e. 2008 + * poll the fence) 2009 + */ 2010 + rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence, 2011 + &status, 0, NULL); 2012 + if (rc) { 2013 + dev_err(hdev->dev, 2014 + "wait_for_fence error :%d for CS seq %llu\n", 2015 + rc, seq_arr[i]); 2016 + break; 2017 + } 2018 + 2019 + mcs_data->stream_master_qid_map |= fence->stream_master_qid_map; 2020 + 2021 + if (status == CS_WAIT_STATUS_BUSY) 2022 + continue; 2023 + 2024 + mcs_data->completion_bitmap |= BIT(i); 2025 + 2026 + /* 2027 + * best effort to extract timestamp. few notes: 2028 + * - if even single fence is gone we cannot extract timestamp 2029 + * (as fence not exist anymore) 2030 + * - for all completed CSs we take the earliest timestamp. 2031 + * for this we have to validate that: 2032 + * 1. given timestamp was indeed set 2033 + * 2. 
the timestamp is earliest of all timestamps so far 2034 + */ 2035 + 2036 + if (status == CS_WAIT_STATUS_GONE) { 2037 + mcs_data->update_ts = false; 2038 + mcs_data->gone_cs = true; 2039 + } else if (mcs_data->update_ts && 2040 + (ktime_compare(fence->timestamp, 2041 + ktime_set(0, 0)) > 0) && 2042 + (ktime_compare(fence->timestamp, first_cs_time) < 0)) { 2043 + first_cs_time = fence->timestamp; 2044 + } 2045 + } 2046 + 2047 + hl_fences_put(mcs_data->fence_arr, arr_len); 2048 + 2049 + if (mcs_data->update_ts && 2050 + (ktime_compare(first_cs_time, max_ktime) != 0)) 2051 + mcs_data->timestamp = ktime_to_ns(first_cs_time); 2223 2052 2224 2053 return rc; 2225 2054 } ··· 2390 1897 enum hl_cs_wait_status *status, s64 *timestamp) 2391 1898 { 2392 1899 struct hl_fence *fence; 2393 - unsigned long timeout; 2394 1900 int rc = 0; 2395 - long completion_rc; 2396 1901 2397 1902 if (timestamp) 2398 1903 *timestamp = 0; 2399 1904 2400 - if (timeout_us == MAX_SCHEDULE_TIMEOUT) 2401 - timeout = timeout_us; 2402 - else 2403 - timeout = usecs_to_jiffies(timeout_us); 2404 - 2405 1905 hl_ctx_get(hdev, ctx); 2406 1906 2407 1907 fence = hl_ctx_get_fence(ctx, seq); 2408 - if (IS_ERR(fence)) { 2409 - rc = PTR_ERR(fence); 2410 - if (rc == -EINVAL) 2411 - dev_notice_ratelimited(hdev->dev, 2412 - "Can't wait on CS %llu because current CS is at seq %llu\n", 2413 - seq, ctx->cs_sequence); 2414 - } else if (fence) { 2415 - if (!timeout_us) 2416 - completion_rc = completion_done(&fence->completion); 2417 - else 2418 - completion_rc = 2419 - wait_for_completion_interruptible_timeout( 2420 - &fence->completion, timeout); 2421 1908 2422 - if (completion_rc > 0) { 2423 - *status = CS_WAIT_STATUS_COMPLETED; 2424 - if (timestamp) 2425 - *timestamp = ktime_to_ns(fence->timestamp); 2426 - } else { 2427 - *status = CS_WAIT_STATUS_BUSY; 2428 - } 2429 - 2430 - if (fence->error == -ETIMEDOUT) 2431 - rc = -ETIMEDOUT; 2432 - else if (fence->error == -EIO) 2433 - rc = -EIO; 2434 - 2435 - hl_fence_put(fence); 
2436 - } else { 2437 - dev_dbg(hdev->dev, 2438 - "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n", 2439 - seq, ctx->cs_sequence); 2440 - *status = CS_WAIT_STATUS_GONE; 2441 - } 2442 - 1909 + rc = hl_wait_for_fence(ctx, seq, fence, status, timeout_us, timestamp); 1910 + hl_fence_put(fence); 2443 1911 hl_ctx_put(ctx); 2444 1912 2445 1913 return rc; 1914 + } 1915 + 1916 + /* 1917 + * hl_wait_multi_cs_completion_init - init completion structure 1918 + * 1919 + * @hdev: pointer to habanalabs device structure 1920 + * @stream_master_bitmap: stream master QIDs map, set bit indicates stream 1921 + * master QID to wait on 1922 + * 1923 + * @return valid completion struct pointer on success, otherwise error pointer 1924 + * 1925 + * up to MULTI_CS_MAX_USER_CTX calls can be done concurrently to the driver. 1926 + * the function gets the first available completion (by marking it "used") 1927 + * and initialize its values. 1928 + */ 1929 + static struct multi_cs_completion *hl_wait_multi_cs_completion_init( 1930 + struct hl_device *hdev, 1931 + u8 stream_master_bitmap) 1932 + { 1933 + struct multi_cs_completion *mcs_compl; 1934 + int i; 1935 + 1936 + /* find free multi_cs completion structure */ 1937 + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { 1938 + mcs_compl = &hdev->multi_cs_completion[i]; 1939 + spin_lock(&mcs_compl->lock); 1940 + if (!mcs_compl->used) { 1941 + mcs_compl->used = 1; 1942 + mcs_compl->timestamp = 0; 1943 + mcs_compl->stream_master_qid_map = stream_master_bitmap; 1944 + reinit_completion(&mcs_compl->completion); 1945 + spin_unlock(&mcs_compl->lock); 1946 + break; 1947 + } 1948 + spin_unlock(&mcs_compl->lock); 1949 + } 1950 + 1951 + if (i == MULTI_CS_MAX_USER_CTX) { 1952 + dev_err(hdev->dev, 1953 + "no available multi-CS completion structure\n"); 1954 + return ERR_PTR(-ENOMEM); 1955 + } 1956 + return mcs_compl; 1957 + } 1958 + 1959 + /* 1960 + * hl_wait_multi_cs_completion_fini - return completion structure and set as 1961 + * 
unused 1962 + * 1963 + * @mcs_compl: pointer to the completion structure 1964 + */ 1965 + static void hl_wait_multi_cs_completion_fini( 1966 + struct multi_cs_completion *mcs_compl) 1967 + { 1968 + /* 1969 + * free completion structure, do it under lock to be in-sync with the 1970 + * thread that signals completion 1971 + */ 1972 + spin_lock(&mcs_compl->lock); 1973 + mcs_compl->used = 0; 1974 + spin_unlock(&mcs_compl->lock); 1975 + } 1976 + 1977 + /* 1978 + * hl_wait_multi_cs_completion - wait for first CS to complete 1979 + * 1980 + * @mcs_data: multi-CS internal data 1981 + * 1982 + * @return 0 on success, otherwise non 0 error code 1983 + */ 1984 + static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data) 1985 + { 1986 + struct hl_device *hdev = mcs_data->ctx->hdev; 1987 + struct multi_cs_completion *mcs_compl; 1988 + long completion_rc; 1989 + 1990 + mcs_compl = hl_wait_multi_cs_completion_init(hdev, 1991 + mcs_data->stream_master_qid_map); 1992 + if (IS_ERR(mcs_compl)) 1993 + return PTR_ERR(mcs_compl); 1994 + 1995 + completion_rc = wait_for_completion_interruptible_timeout( 1996 + &mcs_compl->completion, 1997 + usecs_to_jiffies(mcs_data->timeout_us)); 1998 + 1999 + /* update timestamp */ 2000 + if (completion_rc > 0) 2001 + mcs_data->timestamp = mcs_compl->timestamp; 2002 + 2003 + hl_wait_multi_cs_completion_fini(mcs_compl); 2004 + 2005 + mcs_data->wait_status = completion_rc; 2006 + 2007 + return 0; 2008 + } 2009 + 2010 + /* 2011 + * hl_multi_cs_completion_init - init array of multi-CS completion structures 2012 + * 2013 + * @hdev: pointer to habanalabs device structure 2014 + */ 2015 + void hl_multi_cs_completion_init(struct hl_device *hdev) 2016 + { 2017 + struct multi_cs_completion *mcs_cmpl; 2018 + int i; 2019 + 2020 + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { 2021 + mcs_cmpl = &hdev->multi_cs_completion[i]; 2022 + mcs_cmpl->used = 0; 2023 + spin_lock_init(&mcs_cmpl->lock); 2024 + init_completion(&mcs_cmpl->completion); 2025 + } 2026 + } 
2027 + 2028 + /* 2029 + * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl 2030 + * 2031 + * @hpriv: pointer to the private data of the fd 2032 + * @data: pointer to multi-CS wait ioctl in/out args 2033 + * 2034 + */ 2035 + static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) 2036 + { 2037 + struct hl_device *hdev = hpriv->hdev; 2038 + struct multi_cs_data mcs_data = {0}; 2039 + union hl_wait_cs_args *args = data; 2040 + struct hl_ctx *ctx = hpriv->ctx; 2041 + struct hl_fence **fence_arr; 2042 + void __user *seq_arr; 2043 + u32 size_to_copy; 2044 + u64 *cs_seq_arr; 2045 + u8 seq_arr_len; 2046 + int rc; 2047 + 2048 + if (!hdev->supports_wait_for_multi_cs) { 2049 + dev_err(hdev->dev, "Wait for multi CS is not supported\n"); 2050 + return -EPERM; 2051 + } 2052 + 2053 + seq_arr_len = args->in.seq_arr_len; 2054 + 2055 + if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) { 2056 + dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n", 2057 + HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len); 2058 + return -EINVAL; 2059 + } 2060 + 2061 + /* allocate memory for sequence array */ 2062 + cs_seq_arr = 2063 + kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL); 2064 + if (!cs_seq_arr) 2065 + return -ENOMEM; 2066 + 2067 + /* copy CS sequence array from user */ 2068 + seq_arr = (void __user *) (uintptr_t) args->in.seq; 2069 + size_to_copy = seq_arr_len * sizeof(*cs_seq_arr); 2070 + if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) { 2071 + dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n"); 2072 + rc = -EFAULT; 2073 + goto free_seq_arr; 2074 + } 2075 + 2076 + /* allocate array for the fences */ 2077 + fence_arr = kmalloc_array(seq_arr_len, sizeof(*fence_arr), GFP_KERNEL); 2078 + if (!fence_arr) { 2079 + rc = -ENOMEM; 2080 + goto free_seq_arr; 2081 + } 2082 + 2083 + /* initialize the multi-CS internal data */ 2084 + mcs_data.ctx = ctx; 2085 + mcs_data.seq_arr = cs_seq_arr; 2086 + 
mcs_data.fence_arr = fence_arr; 2087 + mcs_data.arr_len = seq_arr_len; 2088 + 2089 + hl_ctx_get(hdev, ctx); 2090 + 2091 + /* poll all CS fences, extract timestamp */ 2092 + mcs_data.update_ts = true; 2093 + rc = hl_cs_poll_fences(&mcs_data); 2094 + /* 2095 + * skip wait for CS completion when one of the below is true: 2096 + * - an error on the poll function 2097 + * - one or more CS in the list completed 2098 + * - the user called ioctl with timeout 0 2099 + */ 2100 + if (rc || mcs_data.completion_bitmap || !args->in.timeout_us) 2101 + goto put_ctx; 2102 + 2103 + /* wait (with timeout) for the first CS to be completed */ 2104 + mcs_data.timeout_us = args->in.timeout_us; 2105 + rc = hl_wait_multi_cs_completion(&mcs_data); 2106 + if (rc) 2107 + goto put_ctx; 2108 + 2109 + if (mcs_data.wait_status > 0) { 2110 + /* 2111 + * poll fences once again to update the CS map. 2112 + * no timestamp should be updated this time. 2113 + */ 2114 + mcs_data.update_ts = false; 2115 + rc = hl_cs_poll_fences(&mcs_data); 2116 + 2117 + /* 2118 + * if hl_wait_multi_cs_completion returned before timeout (i.e. 2119 + * it got a completion) we expect to see at least one CS 2120 + * completed after the poll function. 
2121 + */ 2122 + if (!mcs_data.completion_bitmap) { 2123 + dev_err(hdev->dev, "Multi-CS got completion on wait but no CS completed\n"); 2124 + rc = -EFAULT; 2125 + } 2126 + } 2127 + 2128 + put_ctx: 2129 + hl_ctx_put(ctx); 2130 + kfree(fence_arr); 2131 + 2132 + free_seq_arr: 2133 + kfree(cs_seq_arr); 2134 + 2135 + /* update output args */ 2136 + memset(args, 0, sizeof(*args)); 2137 + if (rc) 2138 + return rc; 2139 + 2140 + if (mcs_data.completion_bitmap) { 2141 + args->out.status = HL_WAIT_CS_STATUS_COMPLETED; 2142 + args->out.cs_completion_map = mcs_data.completion_bitmap; 2143 + 2144 + /* if timestamp not 0- it's valid */ 2145 + if (mcs_data.timestamp) { 2146 + args->out.timestamp_nsec = mcs_data.timestamp; 2147 + args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD; 2148 + } 2149 + 2150 + /* update if some CS was gone */ 2151 + if (mcs_data.timestamp) 2152 + args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE; 2153 + } else if (mcs_data.wait_status == -ERESTARTSYS) { 2154 + args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED; 2155 + } else { 2156 + args->out.status = HL_WAIT_CS_STATUS_BUSY; 2157 + } 2158 + 2159 + return 0; 2446 2160 } 2447 2161 2448 2162 static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) ··· 2715 2015 { 2716 2016 struct hl_user_pending_interrupt *pend; 2717 2017 struct hl_user_interrupt *interrupt; 2718 - unsigned long timeout; 2719 - long completion_rc; 2018 + unsigned long timeout, flags; 2720 2019 u32 completion_value; 2020 + long completion_rc; 2721 2021 int rc = 0; 2722 2022 2723 2023 if (timeout_us == U32_MAX) ··· 2740 2040 else 2741 2041 interrupt = &hdev->user_interrupt[interrupt_offset]; 2742 2042 2743 - spin_lock(&interrupt->wait_list_lock); 2744 - if (!hl_device_operational(hdev, NULL)) { 2745 - rc = -EPERM; 2746 - goto unlock_and_free_fence; 2747 - } 2748 - 2749 2043 if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) { 2750 - dev_err(hdev->dev, 2751 - "Failed to copy completion value from user\n"); 2044 + 
dev_err(hdev->dev, "Failed to copy completion value from user\n"); 2752 2045 rc = -EFAULT; 2753 - goto unlock_and_free_fence; 2046 + goto free_fence; 2754 2047 } 2755 2048 2756 2049 if (completion_value >= target_value) ··· 2752 2059 *status = CS_WAIT_STATUS_BUSY; 2753 2060 2754 2061 if (!timeout_us || (*status == CS_WAIT_STATUS_COMPLETED)) 2755 - goto unlock_and_free_fence; 2062 + goto free_fence; 2756 2063 2757 2064 /* Add pending user interrupt to relevant list for the interrupt 2758 2065 * handler to monitor 2759 2066 */ 2067 + spin_lock_irqsave(&interrupt->wait_list_lock, flags); 2760 2068 list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head); 2761 - spin_unlock(&interrupt->wait_list_lock); 2069 + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); 2762 2070 2763 2071 wait_again: 2764 2072 /* Wait for interrupt handler to signal completion */ 2765 - completion_rc = 2766 - wait_for_completion_interruptible_timeout( 2767 - &pend->fence.completion, timeout); 2073 + completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion, 2074 + timeout); 2768 2075 2769 2076 /* If timeout did not expire we need to perform the comparison. 
2770 2077 * If comparison fails, keep waiting until timeout expires 2771 2078 */ 2772 2079 if (completion_rc > 0) { 2773 - if (copy_from_user(&completion_value, 2774 - u64_to_user_ptr(user_address), 4)) { 2775 - dev_err(hdev->dev, 2776 - "Failed to copy completion value from user\n"); 2080 + if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) { 2081 + dev_err(hdev->dev, "Failed to copy completion value from user\n"); 2777 2082 rc = -EFAULT; 2083 + 2778 2084 goto remove_pending_user_interrupt; 2779 2085 } 2780 2086 2781 2087 if (completion_value >= target_value) { 2782 2088 *status = CS_WAIT_STATUS_COMPLETED; 2783 2089 } else { 2090 + spin_lock_irqsave(&interrupt->wait_list_lock, flags); 2091 + reinit_completion(&pend->fence.completion); 2784 2092 timeout = completion_rc; 2093 + 2094 + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); 2785 2095 goto wait_again; 2786 2096 } 2097 + } else if (completion_rc == -ERESTARTSYS) { 2098 + dev_err_ratelimited(hdev->dev, 2099 + "user process got signal while waiting for interrupt ID %d\n", 2100 + interrupt->interrupt_id); 2101 + *status = HL_WAIT_CS_STATUS_INTERRUPTED; 2102 + rc = -EINTR; 2787 2103 } else { 2788 2104 *status = CS_WAIT_STATUS_BUSY; 2789 2105 } 2790 2106 2791 2107 remove_pending_user_interrupt: 2792 - spin_lock(&interrupt->wait_list_lock); 2108 + spin_lock_irqsave(&interrupt->wait_list_lock, flags); 2793 2109 list_del(&pend->wait_list_node); 2110 + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); 2794 2111 2795 - unlock_and_free_fence: 2796 - spin_unlock(&interrupt->wait_list_lock); 2112 + free_fence: 2797 2113 kfree(pend); 2798 2114 hl_ctx_put(ctx); 2799 2115 ··· 2850 2148 memset(args, 0, sizeof(*args)); 2851 2149 2852 2150 if (rc) { 2853 - dev_err_ratelimited(hdev->dev, 2854 - "interrupt_wait_ioctl failed (%d)\n", rc); 2151 + if (rc != -EINTR) 2152 + dev_err_ratelimited(hdev->dev, 2153 + "interrupt_wait_ioctl failed (%d)\n", rc); 2855 2154 2856 2155 return rc; 2857 2156 
} ··· 2876 2173 u32 flags = args->in.flags; 2877 2174 int rc; 2878 2175 2176 + /* If the device is not operational, no point in waiting for any command submission or 2177 + * user interrupt 2178 + */ 2179 + if (!hl_device_operational(hpriv->hdev, NULL)) 2180 + return -EPERM; 2181 + 2879 2182 if (flags & HL_WAIT_CS_FLAGS_INTERRUPT) 2880 2183 rc = hl_interrupt_wait_ioctl(hpriv, data); 2184 + else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS) 2185 + rc = hl_multi_cs_wait_ioctl(hpriv, data); 2881 2186 else 2882 2187 rc = hl_cs_wait_ioctl(hpriv, data); 2883 2188

+126 -20

drivers/misc/habanalabs/common/context.c

··· 9 9 10 10 #include <linux/slab.h> 11 11 12 + void hl_encaps_handle_do_release(struct kref *ref) 13 + { 14 + struct hl_cs_encaps_sig_handle *handle = 15 + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); 16 + struct hl_ctx *ctx = handle->hdev->compute_ctx; 17 + struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr; 18 + 19 + spin_lock(&mgr->lock); 20 + idr_remove(&mgr->handles, handle->id); 21 + spin_unlock(&mgr->lock); 22 + 23 + kfree(handle); 24 + } 25 + 26 + static void hl_encaps_handle_do_release_sob(struct kref *ref) 27 + { 28 + struct hl_cs_encaps_sig_handle *handle = 29 + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); 30 + struct hl_ctx *ctx = handle->hdev->compute_ctx; 31 + struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr; 32 + 33 + /* if we're here, then there was a signals reservation but cs with 34 + * encaps signals wasn't submitted, so need to put refcount 35 + * to hw_sob taken at the reservation. 36 + */ 37 + hw_sob_put(handle->hw_sob); 38 + 39 + spin_lock(&mgr->lock); 40 + idr_remove(&mgr->handles, handle->id); 41 + spin_unlock(&mgr->lock); 42 + 43 + kfree(handle); 44 + } 45 + 46 + static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr) 47 + { 48 + spin_lock_init(&mgr->lock); 49 + idr_init(&mgr->handles); 50 + } 51 + 52 + static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, 53 + struct hl_encaps_signals_mgr *mgr) 54 + { 55 + struct hl_cs_encaps_sig_handle *handle; 56 + struct idr *idp; 57 + u32 id; 58 + 59 + idp = &mgr->handles; 60 + 61 + if (!idr_is_empty(idp)) { 62 + dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n"); 63 + idr_for_each_entry(idp, handle, id) 64 + kref_put(&handle->refcount, 65 + hl_encaps_handle_do_release_sob); 66 + } 67 + 68 + idr_destroy(&mgr->handles); 69 + } 70 + 12 71 static void hl_ctx_fini(struct hl_ctx *ctx) 13 72 { 14 73 struct hl_device *hdev = ctx->hdev; 15 74 int i; 16 - 17 - /* Release all allocated pending cb's, those cb's were 
never 18 - * scheduled so it is safe to release them here 19 - */ 20 - hl_pending_cb_list_flush(ctx); 21 75 22 76 /* Release all allocated HW block mapped list entries and destroy 23 77 * the mutex. ··· 107 53 hl_cb_va_pool_fini(ctx); 108 54 hl_vm_ctx_fini(ctx); 109 55 hl_asid_free(hdev, ctx->asid); 56 + hl_encaps_sig_mgr_fini(hdev, &ctx->sig_mgr); 110 57 111 58 /* Scrub both SRAM and DRAM */ 112 59 hdev->asic_funcs->scrub_device_mem(hdev, 0, 0); ··· 185 130 { 186 131 if (kref_put(&ctx->refcount, hl_ctx_do_release) == 1) 187 132 return; 188 - 189 - dev_warn(hdev->dev, 190 - "user process released device but its command submissions are still executing\n"); 191 133 } 192 134 193 135 int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) ··· 196 144 kref_init(&ctx->refcount); 197 145 198 146 ctx->cs_sequence = 1; 199 - INIT_LIST_HEAD(&ctx->pending_cb_list); 200 - spin_lock_init(&ctx->pending_cb_lock); 201 147 spin_lock_init(&ctx->cs_lock); 202 148 atomic_set(&ctx->thread_ctx_switch_token, 1); 203 - atomic_set(&ctx->thread_pending_cb_token, 1); 204 149 ctx->thread_ctx_switch_wait_token = 0; 205 150 ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs, 206 151 sizeof(struct hl_fence *), ··· 249 200 goto err_cb_va_pool_fini; 250 201 } 251 202 203 + hl_encaps_sig_mgr_init(&ctx->sig_mgr); 204 + 252 205 dev_dbg(hdev->dev, "create user context %d\n", ctx->asid); 253 206 } 254 207 ··· 280 229 return kref_put(&ctx->refcount, hl_ctx_do_release); 281 230 } 282 231 283 - struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) 232 + /* 233 + * hl_ctx_get_fence_locked - get CS fence under CS lock 234 + * 235 + * @ctx: pointer to the context structure. 236 + * @seq: CS sequences number 237 + * 238 + * @return valid fence pointer on success, NULL if fence is gone, otherwise 239 + * error pointer. 
240 + * 241 + * NOTE: this function shall be called with cs_lock locked 242 + */ 243 + static struct hl_fence *hl_ctx_get_fence_locked(struct hl_ctx *ctx, u64 seq) 284 244 { 285 245 struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop; 286 246 struct hl_fence *fence; 287 247 288 - spin_lock(&ctx->cs_lock); 289 - 290 - if (seq >= ctx->cs_sequence) { 291 - spin_unlock(&ctx->cs_lock); 248 + if (seq >= ctx->cs_sequence) 292 249 return ERR_PTR(-EINVAL); 293 - } 294 250 295 - if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) { 296 - spin_unlock(&ctx->cs_lock); 251 + if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) 297 252 return NULL; 298 - } 299 253 300 254 fence = ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]; 301 255 hl_fence_get(fence); 256 + return fence; 257 + } 258 + 259 + struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) 260 + { 261 + struct hl_fence *fence; 262 + 263 + spin_lock(&ctx->cs_lock); 264 + 265 + fence = hl_ctx_get_fence_locked(ctx, seq); 302 266 303 267 spin_unlock(&ctx->cs_lock); 304 268 305 269 return fence; 270 + } 271 + 272 + /* 273 + * hl_ctx_get_fences - get multiple CS fences under the same CS lock 274 + * 275 + * @ctx: pointer to the context structure. 
276 + * @seq_arr: array of CS sequences to wait for 277 + * @fence: fence array to store the CS fences 278 + * @arr_len: length of seq_arr and fence_arr 279 + * 280 + * @return 0 on success, otherwise non 0 error code 281 + */ 282 + int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr, 283 + struct hl_fence **fence, u32 arr_len) 284 + { 285 + struct hl_fence **fence_arr_base = fence; 286 + int i, rc = 0; 287 + 288 + spin_lock(&ctx->cs_lock); 289 + 290 + for (i = 0; i < arr_len; i++, fence++) { 291 + u64 seq = seq_arr[i]; 292 + 293 + *fence = hl_ctx_get_fence_locked(ctx, seq); 294 + 295 + if (IS_ERR(*fence)) { 296 + dev_err(ctx->hdev->dev, 297 + "Failed to get fence for CS with seq 0x%llx\n", 298 + seq); 299 + rc = PTR_ERR(*fence); 300 + break; 301 + } 302 + } 303 + 304 + spin_unlock(&ctx->cs_lock); 305 + 306 + if (rc) 307 + hl_fences_put(fence_arr_base, i); 308 + 309 + return rc; 306 310 } 307 311 308 312 /*

+174 -10

drivers/misc/habanalabs/common/debugfs.c

··· 209 209 if (first) { 210 210 first = false; 211 211 seq_puts(s, "\n"); 212 - seq_puts(s, " user virtual address size dma dir\n"); 212 + seq_puts(s, " pid user virtual address size dma dir\n"); 213 213 seq_puts(s, "----------------------------------------------------------\n"); 214 214 } 215 - seq_printf(s, 216 - " 0x%-14llx %-10u %-30s\n", 217 - userptr->addr, userptr->size, dma_dir[userptr->dir]); 215 + seq_printf(s, " %-7d 0x%-14llx %-10llu %-30s\n", 216 + userptr->pid, userptr->addr, userptr->size, 217 + dma_dir[userptr->dir]); 218 218 } 219 219 220 220 spin_unlock(&dev_entry->userptr_spinlock); ··· 235 235 struct hl_vm_hash_node *hnode; 236 236 struct hl_userptr *userptr; 237 237 struct hl_vm_phys_pg_pack *phys_pg_pack = NULL; 238 - enum vm_type_t *vm_type; 238 + enum vm_type *vm_type; 239 239 bool once = true; 240 240 u64 j; 241 241 int i; ··· 261 261 if (*vm_type == VM_TYPE_USERPTR) { 262 262 userptr = hnode->ptr; 263 263 seq_printf(s, 264 - " 0x%-14llx %-10u\n", 264 + " 0x%-14llx %-10llu\n", 265 265 hnode->vaddr, userptr->size); 266 266 } else { 267 267 phys_pg_pack = hnode->ptr; ··· 320 320 return 0; 321 321 } 322 322 323 + static int userptr_lookup_show(struct seq_file *s, void *data) 324 + { 325 + struct hl_debugfs_entry *entry = s->private; 326 + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; 327 + struct scatterlist *sg; 328 + struct hl_userptr *userptr; 329 + bool first = true; 330 + u64 total_npages, npages, sg_start, sg_end; 331 + dma_addr_t dma_addr; 332 + int i; 333 + 334 + spin_lock(&dev_entry->userptr_spinlock); 335 + 336 + list_for_each_entry(userptr, &dev_entry->userptr_list, debugfs_list) { 337 + if (dev_entry->userptr_lookup >= userptr->addr && 338 + dev_entry->userptr_lookup < userptr->addr + userptr->size) { 339 + total_npages = 0; 340 + for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, 341 + i) { 342 + npages = hl_get_sg_info(sg, &dma_addr); 343 + sg_start = userptr->addr + 344 + total_npages * PAGE_SIZE; 345 + sg_end 
= userptr->addr + 346 + (total_npages + npages) * PAGE_SIZE; 347 + 348 + if (dev_entry->userptr_lookup >= sg_start && 349 + dev_entry->userptr_lookup < sg_end) { 350 + dma_addr += (dev_entry->userptr_lookup - 351 + sg_start); 352 + if (first) { 353 + first = false; 354 + seq_puts(s, "\n"); 355 + seq_puts(s, " user virtual address dma address pid region start region size\n"); 356 + seq_puts(s, "---------------------------------------------------------------------------------------\n"); 357 + } 358 + seq_printf(s, " 0x%-18llx 0x%-16llx %-8u 0x%-16llx %-12llu\n", 359 + dev_entry->userptr_lookup, 360 + (u64)dma_addr, userptr->pid, 361 + userptr->addr, userptr->size); 362 + } 363 + total_npages += npages; 364 + } 365 + } 366 + } 367 + 368 + spin_unlock(&dev_entry->userptr_spinlock); 369 + 370 + if (!first) 371 + seq_puts(s, "\n"); 372 + 373 + return 0; 374 + } 375 + 376 + static ssize_t userptr_lookup_write(struct file *file, const char __user *buf, 377 + size_t count, loff_t *f_pos) 378 + { 379 + struct seq_file *s = file->private_data; 380 + struct hl_debugfs_entry *entry = s->private; 381 + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; 382 + ssize_t rc; 383 + u64 value; 384 + 385 + rc = kstrtoull_from_user(buf, count, 16, &value); 386 + if (rc) 387 + return rc; 388 + 389 + dev_entry->userptr_lookup = value; 390 + 391 + return count; 392 + } 393 + 323 394 static int mmu_show(struct seq_file *s, void *data) 324 395 { 325 396 struct hl_debugfs_entry *entry = s->private; ··· 420 349 return 0; 421 350 } 422 351 423 - phys_addr = hops_info.hop_info[hops_info.used_hops - 1].hop_pte_val; 352 + hl_mmu_va_to_pa(ctx, virt_addr, &phys_addr); 424 353 425 354 if (hops_info.scrambled_vaddr && 426 355 (dev_entry->mmu_addr != hops_info.scrambled_vaddr)) ··· 562 491 struct hl_vm_phys_pg_pack *phys_pg_pack; 563 492 struct hl_ctx *ctx = hdev->compute_ctx; 564 493 struct hl_vm_hash_node *hnode; 494 + u64 end_address, range_size; 565 495 struct hl_userptr *userptr; 566 - enum 
vm_type_t *vm_type; 496 + enum vm_type *vm_type; 567 497 bool valid = false; 568 - u64 end_address; 569 - u32 range_size; 570 498 int i, rc = 0; 571 499 572 500 if (!ctx) { ··· 1113 1043 return 0; 1114 1044 } 1115 1045 1046 + static ssize_t hl_state_dump_read(struct file *f, char __user *buf, 1047 + size_t count, loff_t *ppos) 1048 + { 1049 + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; 1050 + ssize_t rc; 1051 + 1052 + down_read(&entry->state_dump_sem); 1053 + if (!entry->state_dump[entry->state_dump_head]) 1054 + rc = 0; 1055 + else 1056 + rc = simple_read_from_buffer( 1057 + buf, count, ppos, 1058 + entry->state_dump[entry->state_dump_head], 1059 + strlen(entry->state_dump[entry->state_dump_head])); 1060 + up_read(&entry->state_dump_sem); 1061 + 1062 + return rc; 1063 + } 1064 + 1065 + static ssize_t hl_state_dump_write(struct file *f, const char __user *buf, 1066 + size_t count, loff_t *ppos) 1067 + { 1068 + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; 1069 + struct hl_device *hdev = entry->hdev; 1070 + ssize_t rc; 1071 + u32 size; 1072 + int i; 1073 + 1074 + rc = kstrtouint_from_user(buf, count, 10, &size); 1075 + if (rc) 1076 + return rc; 1077 + 1078 + if (size <= 0 || size >= ARRAY_SIZE(entry->state_dump)) { 1079 + dev_err(hdev->dev, "Invalid number of dumps to skip\n"); 1080 + return -EINVAL; 1081 + } 1082 + 1083 + if (entry->state_dump[entry->state_dump_head]) { 1084 + down_write(&entry->state_dump_sem); 1085 + for (i = 0; i < size; ++i) { 1086 + vfree(entry->state_dump[entry->state_dump_head]); 1087 + entry->state_dump[entry->state_dump_head] = NULL; 1088 + if (entry->state_dump_head > 0) 1089 + entry->state_dump_head--; 1090 + else 1091 + entry->state_dump_head = 1092 + ARRAY_SIZE(entry->state_dump) - 1; 1093 + } 1094 + up_write(&entry->state_dump_sem); 1095 + } 1096 + 1097 + return count; 1098 + } 1099 + 1116 1100 static const struct file_operations hl_data32b_fops = { 1117 1101 .owner = THIS_MODULE, 1118 1102 .read = 
hl_data_read32, ··· 1234 1110 .read = hl_security_violations_read 1235 1111 }; 1236 1112 1113 + static const struct file_operations hl_state_dump_fops = { 1114 + .owner = THIS_MODULE, 1115 + .read = hl_state_dump_read, 1116 + .write = hl_state_dump_write 1117 + }; 1118 + 1237 1119 static const struct hl_info_list hl_debugfs_list[] = { 1238 1120 {"command_buffers", command_buffers_show, NULL}, 1239 1121 {"command_submission", command_submission_show, NULL}, 1240 1122 {"command_submission_jobs", command_submission_jobs_show, NULL}, 1241 1123 {"userptr", userptr_show, NULL}, 1242 1124 {"vm", vm_show, NULL}, 1125 + {"userptr_lookup", userptr_lookup_show, userptr_lookup_write}, 1243 1126 {"mmu", mmu_show, mmu_asid_va_write}, 1244 1127 {"engines", engines_show, NULL} 1245 1128 }; ··· 1303 1172 INIT_LIST_HEAD(&dev_entry->userptr_list); 1304 1173 INIT_LIST_HEAD(&dev_entry->ctx_mem_hash_list); 1305 1174 mutex_init(&dev_entry->file_mutex); 1175 + init_rwsem(&dev_entry->state_dump_sem); 1306 1176 spin_lock_init(&dev_entry->cb_spinlock); 1307 1177 spin_lock_init(&dev_entry->cs_spinlock); 1308 1178 spin_lock_init(&dev_entry->cs_job_spinlock); ··· 1415 1283 dev_entry->root, 1416 1284 &hdev->skip_reset_on_timeout); 1417 1285 1286 + debugfs_create_file("state_dump", 1287 + 0600, 1288 + dev_entry->root, 1289 + dev_entry, 1290 + &hl_state_dump_fops); 1291 + 1418 1292 for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { 1419 1293 debugfs_create_file(hl_debugfs_list[i].name, 1420 1294 0444, ··· 1435 1297 void hl_debugfs_remove_device(struct hl_device *hdev) 1436 1298 { 1437 1299 struct hl_dbg_device_entry *entry = &hdev->hl_debugfs; 1300 + int i; 1438 1301 1439 1302 debugfs_remove_recursive(entry->root); 1440 1303 1441 1304 mutex_destroy(&entry->file_mutex); 1442 1305 1443 1306 vfree(entry->blob_desc.data); 1307 + 1308 + for (i = 0; i < ARRAY_SIZE(entry->state_dump); ++i) 1309 + vfree(entry->state_dump[i]); 1444 1310 1445 1311 kfree(entry->entry_arr); 1446 1312 } ··· 
1556 1414 spin_lock(&dev_entry->ctx_mem_hash_spinlock); 1557 1415 list_del(&ctx->debugfs_list); 1558 1416 spin_unlock(&dev_entry->ctx_mem_hash_spinlock); 1417 + } 1418 + 1419 + /** 1420 + * hl_debugfs_set_state_dump - register state dump making it accessible via 1421 + * debugfs 1422 + * @hdev: pointer to the device structure 1423 + * @data: the actual dump data 1424 + * @length: the length of the data 1425 + */ 1426 + void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data, 1427 + unsigned long length) 1428 + { 1429 + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; 1430 + 1431 + down_write(&dev_entry->state_dump_sem); 1432 + 1433 + dev_entry->state_dump_head = (dev_entry->state_dump_head + 1) % 1434 + ARRAY_SIZE(dev_entry->state_dump); 1435 + vfree(dev_entry->state_dump[dev_entry->state_dump_head]); 1436 + dev_entry->state_dump[dev_entry->state_dump_head] = data; 1437 + 1438 + up_write(&dev_entry->state_dump_sem); 1559 1439 } 1560 1440 1561 1441 void __init hl_debugfs_init(void)

+86 -77

drivers/misc/habanalabs/common/device.c

··· 7 7 8 8 #define pr_fmt(fmt) "habanalabs: " fmt 9 9 10 + #include <uapi/misc/habanalabs.h> 10 11 #include "habanalabs.h" 11 12 12 13 #include <linux/pci.h> 13 14 #include <linux/hwmon.h> 14 - #include <uapi/misc/habanalabs.h> 15 15 16 16 enum hl_device_status hl_device_status(struct hl_device *hdev) 17 17 { ··· 23 23 status = HL_DEVICE_STATUS_NEEDS_RESET; 24 24 else if (hdev->disabled) 25 25 status = HL_DEVICE_STATUS_MALFUNCTION; 26 + else if (!hdev->init_done) 27 + status = HL_DEVICE_STATUS_IN_DEVICE_CREATION; 26 28 else 27 29 status = HL_DEVICE_STATUS_OPERATIONAL; 28 30 ··· 46 44 case HL_DEVICE_STATUS_NEEDS_RESET: 47 45 return false; 48 46 case HL_DEVICE_STATUS_OPERATIONAL: 47 + case HL_DEVICE_STATUS_IN_DEVICE_CREATION: 49 48 default: 50 49 return true; 51 50 } ··· 132 129 hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr); 133 130 134 131 if (!hl_hpriv_put(hpriv)) 135 - dev_warn(hdev->dev, 136 - "Device is still in use because there are live CS and/or memory mappings\n"); 132 + dev_notice(hdev->dev, 133 + "User process closed FD but device still in use\n"); 137 134 138 135 hdev->last_open_session_duration_jif = 139 136 jiffies - hdev->last_successful_open_jif; ··· 311 308 container_of(work, struct hl_device_reset_work, 312 309 reset_work.work); 313 310 struct hl_device *hdev = device_reset_work->hdev; 311 + u32 flags; 314 312 int rc; 315 313 316 - rc = hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD); 314 + flags = HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD; 315 + 316 + if (device_reset_work->fw_reset) 317 + flags |= HL_RESET_FW; 318 + 319 + rc = hl_device_reset(hdev, flags); 317 320 if ((rc == -EBUSY) && !hdev->device_fini_pending) { 318 321 dev_info(hdev->dev, 319 322 "Could not reset device. 
will try again in %u seconds", ··· 691 682 return rc; 692 683 } 693 684 685 + static void take_release_locks(struct hl_device *hdev) 686 + { 687 + /* Flush anyone that is inside the critical section of enqueue 688 + * jobs to the H/W 689 + */ 690 + hdev->asic_funcs->hw_queues_lock(hdev); 691 + hdev->asic_funcs->hw_queues_unlock(hdev); 692 + 693 + /* Flush processes that are sending message to CPU */ 694 + mutex_lock(&hdev->send_cpu_message_lock); 695 + mutex_unlock(&hdev->send_cpu_message_lock); 696 + 697 + /* Flush anyone that is inside device open */ 698 + mutex_lock(&hdev->fpriv_list_lock); 699 + mutex_unlock(&hdev->fpriv_list_lock); 700 + } 701 + 702 + static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset) 703 + { 704 + if (hard_reset) 705 + device_late_fini(hdev); 706 + 707 + /* 708 + * Halt the engines and disable interrupts so we won't get any more 709 + * completions from H/W and we won't have any accesses from the 710 + * H/W to the host machine 711 + */ 712 + hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset); 713 + 714 + /* Go over all the queues, release all CS and their jobs */ 715 + hl_cs_rollback_all(hdev); 716 + 717 + /* Release all pending user interrupts, each pending user interrupt 718 + * holds a reference to user context 719 + */ 720 + hl_release_pending_user_interrupts(hdev); 721 + } 722 + 694 723 /* 695 724 * hl_device_suspend - initiate device suspend 696 725 * ··· 754 707 /* This blocks all other stuff that is not blocked by in_reset */ 755 708 hdev->disabled = true; 756 709 757 - /* 758 - * Flush anyone that is inside the critical section of enqueue 759 - * jobs to the H/W 760 - */ 761 - hdev->asic_funcs->hw_queues_lock(hdev); 762 - hdev->asic_funcs->hw_queues_unlock(hdev); 763 - 764 - /* Flush processes that are sending message to CPU */ 765 - mutex_lock(&hdev->send_cpu_message_lock); 766 - mutex_unlock(&hdev->send_cpu_message_lock); 710 + take_release_locks(hdev); 767 711 768 712 rc = 
hdev->asic_funcs->suspend(hdev); 769 713 if (rc) ··· 857 819 usleep_range(1000, 10000); 858 820 859 821 put_task_struct(task); 822 + } else { 823 + dev_warn(hdev->dev, 824 + "Can't get task struct for PID so giving up on killing process\n"); 825 + mutex_unlock(&hdev->fpriv_list_lock); 826 + return -ETIME; 860 827 } 861 828 } 862 829 ··· 928 885 int hl_device_reset(struct hl_device *hdev, u32 flags) 929 886 { 930 887 u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; 931 - bool hard_reset, from_hard_reset_thread, hard_instead_soft = false; 888 + bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false; 932 889 int i, rc; 933 890 934 891 if (!hdev->init_done) { ··· 937 894 return 0; 938 895 } 939 896 940 - hard_reset = (flags & HL_RESET_HARD) != 0; 941 - from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0; 897 + hard_reset = !!(flags & HL_RESET_HARD); 898 + from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD); 899 + fw_reset = !!(flags & HL_RESET_FW); 942 900 943 901 if (!hard_reset && !hdev->supports_soft_reset) { 944 902 hard_instead_soft = true; ··· 991 947 else 992 948 hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; 993 949 994 - /* 995 - * if reset is due to heartbeat, device CPU is no responsive in 996 - * which case no point sending PCI disable message to it 950 + /* If reset is due to heartbeat, device CPU is no responsive in 951 + * which case no point sending PCI disable message to it. 952 + * 953 + * If F/W is performing the reset, no need to send it a message to disable 954 + * PCI access 997 955 */ 998 - if (hard_reset && !(flags & HL_RESET_HEARTBEAT)) { 956 + if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) { 999 957 /* Disable PCI access from device F/W so he won't send 1000 958 * us additional interrupts. 
We disable MSI/MSI-X at 1001 959 * the halt_engines function and we can't have the F/W ··· 1016 970 /* This also blocks future CS/VM/JOB completion operations */ 1017 971 hdev->disabled = true; 1018 972 1019 - /* Flush anyone that is inside the critical section of enqueue 1020 - * jobs to the H/W 1021 - */ 1022 - hdev->asic_funcs->hw_queues_lock(hdev); 1023 - hdev->asic_funcs->hw_queues_unlock(hdev); 1024 - 1025 - /* Flush anyone that is inside device open */ 1026 - mutex_lock(&hdev->fpriv_list_lock); 1027 - mutex_unlock(&hdev->fpriv_list_lock); 973 + take_release_locks(hdev); 1028 974 1029 975 dev_err(hdev->dev, "Going to RESET device!\n"); 1030 976 } ··· 1026 988 hdev->hard_reset_pending = true; 1027 989 1028 990 hdev->process_kill_trial_cnt = 0; 991 + 992 + hdev->device_reset_work.fw_reset = fw_reset; 1029 993 1030 994 /* 1031 995 * Because the reset function can't run from heartbeat work, ··· 1039 999 return 0; 1040 1000 } 1041 1001 1042 - if (hard_reset) { 1043 - device_late_fini(hdev); 1044 - 1045 - /* 1046 - * Now that the heartbeat thread is closed, flush processes 1047 - * which are sending messages to CPU 1048 - */ 1049 - mutex_lock(&hdev->send_cpu_message_lock); 1050 - mutex_unlock(&hdev->send_cpu_message_lock); 1051 - } 1052 - 1053 - /* 1054 - * Halt the engines and disable interrupts so we won't get any more 1055 - * completions from H/W and we won't have any accesses from the 1056 - * H/W to the host machine 1057 - */ 1058 - hdev->asic_funcs->halt_engines(hdev, hard_reset); 1059 - 1060 - /* Go over all the queues, release all CS and their jobs */ 1061 - hl_cs_rollback_all(hdev); 1062 - 1063 - /* Release all pending user interrupts, each pending user interrupt 1064 - * holds a reference to user context 1065 - */ 1066 - hl_release_pending_user_interrupts(hdev); 1002 + cleanup_resources(hdev, hard_reset, fw_reset); 1067 1003 1068 1004 kill_processes: 1069 1005 if (hard_reset) { ··· 1073 1057 } 1074 1058 1075 1059 /* Reset the H/W. 
It will be in idle state after this returns */ 1076 - hdev->asic_funcs->hw_fini(hdev, hard_reset); 1060 + hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset); 1077 1061 1078 1062 if (hard_reset) { 1063 + hdev->fw_loader.linux_loaded = false; 1064 + 1079 1065 /* Release kernel context */ 1080 1066 if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1) 1081 1067 hdev->kernel_ctx = NULL; 1068 + 1082 1069 hl_vm_fini(hdev); 1083 1070 hl_mmu_fini(hdev); 1084 1071 hl_eq_reset(hdev, &hdev->event_queue); ··· 1311 1292 if (rc) 1312 1293 goto user_interrupts_fini; 1313 1294 1295 + 1296 + /* initialize completion structure for multi CS wait */ 1297 + hl_multi_cs_completion_init(hdev); 1298 + 1314 1299 /* 1315 1300 * Initialize the H/W queues. Must be done before hw_init, because 1316 1301 * there the addresses of the kernel queue are being written to the ··· 1383 1360 } 1384 1361 1385 1362 hdev->compute_ctx = NULL; 1363 + 1364 + hdev->asic_funcs->state_dump_init(hdev); 1386 1365 1387 1366 hl_debugfs_add_device(hdev); 1388 1367 ··· 1592 1567 /* Mark device as disabled */ 1593 1568 hdev->disabled = true; 1594 1569 1595 - /* Flush anyone that is inside the critical section of enqueue 1596 - * jobs to the H/W 1597 - */ 1598 - hdev->asic_funcs->hw_queues_lock(hdev); 1599 - hdev->asic_funcs->hw_queues_unlock(hdev); 1600 - 1601 - /* Flush anyone that is inside device open */ 1602 - mutex_lock(&hdev->fpriv_list_lock); 1603 - mutex_unlock(&hdev->fpriv_list_lock); 1570 + take_release_locks(hdev); 1604 1571 1605 1572 hdev->hard_reset_pending = true; 1606 1573 1607 1574 hl_hwmon_fini(hdev); 1608 1575 1609 - device_late_fini(hdev); 1610 - 1611 - /* 1612 - * Halt the engines and disable interrupts so we won't get any more 1613 - * completions from H/W and we won't have any accesses from the 1614 - * H/W to the host machine 1615 - */ 1616 - hdev->asic_funcs->halt_engines(hdev, true); 1617 - 1618 - /* Go over all the queues, release all CS and their jobs */ 1619 - 
hl_cs_rollback_all(hdev); 1576 + cleanup_resources(hdev, true, false); 1620 1577 1621 1578 /* Kill processes here after CS rollback. This is because the process 1622 1579 * can't really exit until all its CSs are done, which is what we ··· 1617 1610 hl_cb_pool_fini(hdev); 1618 1611 1619 1612 /* Reset the H/W. It will be in idle state after this returns */ 1620 - hdev->asic_funcs->hw_fini(hdev, true); 1613 + hdev->asic_funcs->hw_fini(hdev, true, false); 1614 + 1615 + hdev->fw_loader.linux_loaded = false; 1621 1616 1622 1617 /* Release kernel context */ 1623 1618 if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))

+32 -24

drivers/misc/habanalabs/common/firmware_if.c

··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 3 /* 4 - * Copyright 2016-2019 HabanaLabs, Ltd. 4 + * Copyright 2016-2021 HabanaLabs, Ltd. 5 5 * All Rights Reserved. 6 6 */ 7 7 ··· 240 240 /* set fence to a non valid value */ 241 241 pkt->fence = cpu_to_le32(UINT_MAX); 242 242 243 - rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr); 244 - if (rc) { 245 - dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc); 246 - goto out; 247 - } 243 + /* 244 + * The CPU queue is a synchronous queue with an effective depth of 245 + * a single entry (although it is allocated with room for multiple 246 + * entries). We lock on it using 'send_cpu_message_lock' which 247 + * serializes accesses to the CPU queue. 248 + * Which means that we don't need to lock the access to the entire H/W 249 + * queues module when submitting a JOB to the CPU queue. 250 + */ 251 + hl_hw_queue_submit_bd(hdev, queue, 0, len, pkt_dma_addr); 248 252 249 253 if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN) 250 254 expected_ack_val = queue->pi; ··· 667 663 hdev->event_queue.check_eqe_index = false; 668 664 669 665 /* Read FW application security bits again */ 670 - if (hdev->asic_prop.fw_cpu_boot_dev_sts0_valid) { 671 - hdev->asic_prop.fw_app_cpu_boot_dev_sts0 = 672 - RREG32(sts_boot_dev_sts0_reg); 673 - if (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & 666 + if (prop->fw_cpu_boot_dev_sts0_valid) { 667 + prop->fw_app_cpu_boot_dev_sts0 = RREG32(sts_boot_dev_sts0_reg); 668 + if (prop->fw_app_cpu_boot_dev_sts0 & 674 669 CPU_BOOT_DEV_STS0_EQ_INDEX_EN) 675 670 hdev->event_queue.check_eqe_index = true; 676 671 } 677 672 678 - if (hdev->asic_prop.fw_cpu_boot_dev_sts1_valid) 679 - hdev->asic_prop.fw_app_cpu_boot_dev_sts1 = 680 - RREG32(sts_boot_dev_sts1_reg); 673 + if (prop->fw_cpu_boot_dev_sts1_valid) 674 + prop->fw_app_cpu_boot_dev_sts1 = RREG32(sts_boot_dev_sts1_reg); 681 675 682 676 out: 683 677 hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, ··· 1010 1008 } 
else { 1011 1009 WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE); 1012 1010 msleep(static_loader->cpu_reset_wait_msec); 1011 + 1012 + /* Must clear this register in order to prevent preboot 1013 + * from reading WFE after reboot 1014 + */ 1015 + WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_NA); 1013 1016 } 1014 1017 1015 1018 hdev->device_cpu_is_halted = true; ··· 1061 1054 case CPU_BOOT_STATUS_TS_INIT_FAIL: 1062 1055 dev_err(hdev->dev, 1063 1056 "Device boot progress - Thermal Sensor initialization failed\n"); 1057 + break; 1058 + case CPU_BOOT_STATUS_SECURITY_READY: 1059 + dev_err(hdev->dev, 1060 + "Device boot progress - Stuck in preboot after security initialization\n"); 1064 1061 break; 1065 1062 default: 1066 1063 dev_err(hdev->dev, ··· 1249 1238 * b. Check whether hard reset is done by boot cpu 1250 1239 * 3. FW application - a. Fetch fw application security status 1251 1240 * b. Check whether hard reset is done by fw app 1252 - * 1253 - * Preboot: 1254 - * Check security status bit (CPU_BOOT_DEV_STS0_ENABLED). If set, then- 1255 - * check security enabled bit (CPU_BOOT_DEV_STS0_SECURITY_EN) 1256 - * If set, then mark GIC controller to be disabled. 
1257 1241 */ 1258 1242 prop->hard_reset_done_by_fw = 1259 1243 !!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN); ··· 1959 1953 if (!hdev->asic_prop.gic_interrupts_enable && 1960 1954 !(hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & 1961 1955 CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN)) { 1962 - dyn_regs->gic_host_halt_irq = dyn_regs->gic_host_irq_ctrl; 1963 - dyn_regs->gic_host_ints_irq = dyn_regs->gic_host_irq_ctrl; 1956 + dyn_regs->gic_host_halt_irq = dyn_regs->gic_host_pi_upd_irq; 1957 + dyn_regs->gic_host_ints_irq = dyn_regs->gic_host_pi_upd_irq; 1964 1958 1965 1959 dev_warn(hdev->dev, 1966 1960 "Using a single interrupt interface towards cpucp"); ··· 2128 2122 2129 2123 /* Read FW application security bits */ 2130 2124 if (prop->fw_cpu_boot_dev_sts0_valid) { 2131 - prop->fw_app_cpu_boot_dev_sts0 = 2132 - RREG32(cpu_boot_dev_sts0_reg); 2125 + prop->fw_app_cpu_boot_dev_sts0 = RREG32(cpu_boot_dev_sts0_reg); 2133 2126 2134 2127 if (prop->fw_app_cpu_boot_dev_sts0 & 2135 2128 CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) ··· 2148 2143 } 2149 2144 2150 2145 if (prop->fw_cpu_boot_dev_sts1_valid) { 2151 - prop->fw_app_cpu_boot_dev_sts1 = 2152 - RREG32(cpu_boot_dev_sts1_reg); 2146 + prop->fw_app_cpu_boot_dev_sts1 = RREG32(cpu_boot_dev_sts1_reg); 2153 2147 2154 2148 dev_dbg(hdev->dev, 2155 2149 "Firmware application CPU status1 %#x\n", ··· 2239 2235 dev_info(hdev->dev, 2240 2236 "Loading firmware to device, may take some time...\n"); 2241 2237 2238 + /* 2239 + * In this stage, "cpu_dyn_regs" contains only LKD's hard coded values! 2240 + * It will be updated from FW after hl_fw_dynamic_request_descriptor(). 2241 + */ 2242 2242 dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs; 2243 2243 2244 2244 rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, COMMS_RST_STATE,

+375 -46

drivers/misc/habanalabs/common/habanalabs.h

··· 20 20 #include <linux/scatterlist.h> 21 21 #include <linux/hashtable.h> 22 22 #include <linux/debugfs.h> 23 + #include <linux/rwsem.h> 23 24 #include <linux/bitfield.h> 24 25 #include <linux/genalloc.h> 25 26 #include <linux/sched/signal.h> ··· 65 64 #define HL_SIM_MAX_TIMEOUT_US 10000000 /* 10s */ 66 65 67 66 #define HL_COMMON_USER_INTERRUPT_ID 0xFFF 67 + 68 + #define HL_STATE_DUMP_HIST_LEN 5 69 + 70 + #define OBJ_NAMES_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ 71 + #define SYNC_TO_ENGINE_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ 68 72 69 73 /* Memory */ 70 74 #define MEM_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ ··· 128 122 * 129 123 * - HL_RESET_DEVICE_RELEASE 130 124 * Set if reset is due to device release 125 + * 126 + * - HL_RESET_FW 127 + * F/W will perform the reset. No need to ask it to reset the device. This is relevant 128 + * only when running with secured f/w 131 129 */ 132 130 #define HL_RESET_HARD (1 << 0) 133 131 #define HL_RESET_FROM_RESET_THREAD (1 << 1) 134 132 #define HL_RESET_HEARTBEAT (1 << 2) 135 133 #define HL_RESET_TDR (1 << 3) 136 134 #define HL_RESET_DEVICE_RELEASE (1 << 4) 135 + #define HL_RESET_FW (1 << 5) 137 136 138 137 #define HL_MAX_SOBS_PER_MONITOR 8 139 138 ··· 247 236 CS_TYPE_DEFAULT, 248 237 CS_TYPE_SIGNAL, 249 238 CS_TYPE_WAIT, 250 - CS_TYPE_COLLECTIVE_WAIT 239 + CS_TYPE_COLLECTIVE_WAIT, 240 + CS_RESERVE_SIGNALS, 241 + CS_UNRESERVE_SIGNALS 251 242 }; 252 243 253 244 /* ··· 294 281 * @hdev: habanalabs device structure. 295 282 * @kref: refcount of this SOB. The SOB will reset once the refcount is zero. 296 283 * @sob_id: id of this SOB. 284 + * @sob_addr: the sob offset from the base address. 297 285 * @q_idx: the H/W queue that uses this SOB. 286 + * @need_reset: reset indication set when switching to the other sob. 
298 287 */ 299 288 struct hl_hw_sob { 300 289 struct hl_device *hdev; 301 290 struct kref kref; 302 291 u32 sob_id; 292 + u32 sob_addr; 303 293 u32 q_idx; 294 + bool need_reset; 304 295 }; 305 296 306 297 enum hl_collective_mode { ··· 334 317 }; 335 318 336 319 /** 337 - * enum vm_type_t - virtual memory mapping request information. 320 + * enum vm_type - virtual memory mapping request information. 338 321 * @VM_TYPE_USERPTR: mapping of user memory to device virtual address. 339 322 * @VM_TYPE_PHYS_PACK: mapping of DRAM memory to device virtual address. 340 323 */ 341 - enum vm_type_t { 324 + enum vm_type { 342 325 VM_TYPE_USERPTR = 0x1, 343 326 VM_TYPE_PHYS_PACK = 0x2 344 327 }; ··· 399 382 }; 400 383 401 384 /** 385 + * struct hl_hints_range - hint addresses reserved va range. 386 + * @start_addr: start address of the va range. 387 + * @end_addr: end address of the va range. 388 + */ 389 + struct hl_hints_range { 390 + u64 start_addr; 391 + u64 end_addr; 392 + }; 393 + 394 + /** 402 395 * struct asic_fixed_properties - ASIC specific immutable properties. 403 396 * @hw_queues_props: H/W queues properties. 404 397 * @cpucp_info: received various information from CPU-CP regarding the H/W, e.g. ··· 419 392 * @pmmu: PCI (host) MMU address translation properties. 420 393 * @pmmu_huge: PCI (host) MMU address translation properties for memory 421 394 * allocated with huge pages. 395 + * @hints_dram_reserved_va_range: dram hint addresses reserved range. 396 + * @hints_host_reserved_va_range: host hint addresses reserved range. 397 + * @hints_host_hpage_reserved_va_range: host huge page hint addresses reserved 398 + * range. 422 399 * @sram_base_address: SRAM physical start address. 423 400 * @sram_end_address: SRAM physical end address. 424 401 * @sram_user_base_address - SRAM physical start address for user access. ··· 443 412 * to the device's MMU. 444 413 * @cb_va_end_addr: virtual end address of command buffers which are mapped to 445 414 * the device's MMU. 
415 + * @dram_hints_align_mask: dram va hint addresses alignment mask which is used 416 + * for hints validity check. 417 + * device_dma_offset_for_host_access: the offset to add to host DMA addresses 418 + * to enable the device to access them. 446 419 * @mmu_pgt_size: MMU page tables total size. 447 420 * @mmu_pte_size: PTE size in MMU page tables. 448 421 * @mmu_hop_table_size: MMU hop table size. ··· 494 459 * reserved for the user 495 460 * @first_available_cq: first available CQ for the user. 496 461 * @user_interrupt_count: number of user interrupts. 462 + * @server_type: Server type that the ASIC is currently installed in. 463 + * The value is according to enum hl_server_type in uapi file. 497 464 * @tpc_enabled_mask: which TPCs are enabled. 498 465 * @completion_queues_count: number of completion queues. 499 466 * @fw_security_enabled: true if security measures are enabled in firmware, ··· 507 470 * @dram_supports_virtual_memory: is there an MMU towards the DRAM 508 471 * @hard_reset_done_by_fw: true if firmware is handling hard reset flow 509 472 * @num_functional_hbms: number of functional HBMs in each DCORE. 473 + * @hints_range_reservation: device support hint addresses range reservation. 510 474 * @iatu_done_by_fw: true if iATU configuration is being done by FW. 511 475 * @dynamic_fw_load: is dynamic FW load is supported. 
512 476 * @gic_interrupts_enable: true if FW is not blocking GIC controller, ··· 521 483 struct hl_mmu_properties dmmu; 522 484 struct hl_mmu_properties pmmu; 523 485 struct hl_mmu_properties pmmu_huge; 486 + struct hl_hints_range hints_dram_reserved_va_range; 487 + struct hl_hints_range hints_host_reserved_va_range; 488 + struct hl_hints_range hints_host_hpage_reserved_va_range; 524 489 u64 sram_base_address; 525 490 u64 sram_end_address; 526 491 u64 sram_user_base_address; ··· 541 500 u64 mmu_dram_default_page_addr; 542 501 u64 cb_va_start_addr; 543 502 u64 cb_va_end_addr; 503 + u64 dram_hints_align_mask; 504 + u64 device_dma_offset_for_host_access; 544 505 u32 mmu_pgt_size; 545 506 u32 mmu_pte_size; 546 507 u32 mmu_hop_table_size; ··· 577 534 u16 first_available_user_msix_interrupt; 578 535 u16 first_available_cq[HL_MAX_DCORES]; 579 536 u16 user_interrupt_count; 537 + u16 server_type; 580 538 u8 tpc_enabled_mask; 581 539 u8 completion_queues_count; 582 540 u8 fw_security_enabled; ··· 586 542 u8 dram_supports_virtual_memory; 587 543 u8 hard_reset_done_by_fw; 588 544 u8 num_functional_hbms; 545 + u8 hints_range_reservation; 589 546 u8 iatu_done_by_fw; 590 547 u8 dynamic_fw_load; 591 548 u8 gic_interrupts_enable; ··· 597 552 * @completion: fence is implemented using completion 598 553 * @refcount: refcount for this fence 599 554 * @cs_sequence: sequence of the corresponding command submission 555 + * @stream_master_qid_map: streams masters QID bitmap to represent all streams 556 + * masters QIDs that multi cs is waiting on 600 557 * @error: mark this fence with error 601 558 * @timestamp: timestamp upon completion 602 - * 603 559 */ 604 560 struct hl_fence { 605 561 struct completion completion; 606 562 struct kref refcount; 607 563 u64 cs_sequence; 564 + u32 stream_master_qid_map; 608 565 int error; 609 566 ktime_t timestamp; 610 567 }; 611 568 612 569 /** 613 570 * struct hl_cs_compl - command submission completion object. 
614 - * @sob_reset_work: workqueue object to run SOB reset flow. 615 571 * @base_fence: hl fence object. 616 572 * @lock: spinlock to protect fence. 617 573 * @hdev: habanalabs device structure. 618 574 * @hw_sob: the H/W SOB used in this signal/wait CS. 575 + * @encaps_sig_hdl: encaps signals hanlder. 619 576 * @cs_seq: command submission sequence number. 620 577 * @type: type of the CS - signal/wait. 621 578 * @sob_val: the SOB value that is used in this signal/wait CS. 622 579 * @sob_group: the SOB group that is used in this collective wait CS. 580 + * @encaps_signals: indication whether it's a completion object of cs with 581 + * encaps signals or not. 623 582 */ 624 583 struct hl_cs_compl { 625 - struct work_struct sob_reset_work; 626 584 struct hl_fence base_fence; 627 585 spinlock_t lock; 628 586 struct hl_device *hdev; 629 587 struct hl_hw_sob *hw_sob; 588 + struct hl_cs_encaps_sig_handle *encaps_sig_hdl; 630 589 u64 cs_seq; 631 590 enum hl_cs_type type; 632 591 u16 sob_val; 633 592 u16 sob_group; 593 + bool encaps_signals; 634 594 }; 635 595 636 596 /* ··· 745 695 u16 collective_slave_mon_id; 746 696 u16 collective_sob_id; 747 697 u8 curr_sob_offset; 698 + }; 699 + 700 + /** 701 + * struct hl_encaps_signals_mgr - describes sync stream encapsulated signals 702 + * handlers manager 703 + * @lock: protects handles. 704 + * @handles: an idr to hold all encapsulated signals handles. 705 + */ 706 + struct hl_encaps_signals_mgr { 707 + spinlock_t lock; 708 + struct idr handles; 748 709 }; 749 710 750 711 /** ··· 936 875 u64 region_base; 937 876 u64 region_size; 938 877 u64 bar_size; 939 - u32 offset_in_bar; 878 + u64 offset_in_bar; 940 879 u8 bar_id; 941 880 u8 used; 942 881 }; ··· 1057 996 * hw_fini and before CS rollback. 1058 997 * @suspend: handles IP specific H/W or SW changes for suspend. 1059 998 * @resume: handles IP specific H/W or SW changes for resume. 1060 - * @cb_mmap: maps a CB. 999 + * @mmap: maps a memory. 
1061 1000 * @ring_doorbell: increment PI on a given QMAN. 1062 1001 * @pqe_write: Write the PQ entry to the PQ. This is ASIC-specific 1063 1002 * function because the PQs are located in different memory areas ··· 1162 1101 * generic f/w compatible PLL Indexes 1163 1102 * @init_firmware_loader: initialize data for FW loader. 1164 1103 * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling 1104 + * @state_dump_init: initialize constants required for state dump 1105 + * @get_sob_addr: get SOB base address offset. 1106 + * @set_pci_memory_regions: setting properties of PCI memory regions 1107 + * @get_stream_master_qid_arr: get pointer to stream masters QID array 1165 1108 */ 1166 1109 struct hl_asic_funcs { 1167 1110 int (*early_init)(struct hl_device *hdev); ··· 1175 1110 int (*sw_init)(struct hl_device *hdev); 1176 1111 int (*sw_fini)(struct hl_device *hdev); 1177 1112 int (*hw_init)(struct hl_device *hdev); 1178 - void (*hw_fini)(struct hl_device *hdev, bool hard_reset); 1179 - void (*halt_engines)(struct hl_device *hdev, bool hard_reset); 1113 + void (*hw_fini)(struct hl_device *hdev, bool hard_reset, bool fw_reset); 1114 + void (*halt_engines)(struct hl_device *hdev, bool hard_reset, bool fw_reset); 1180 1115 int (*suspend)(struct hl_device *hdev); 1181 1116 int (*resume)(struct hl_device *hdev); 1182 - int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma, 1117 + int (*mmap)(struct hl_device *hdev, struct vm_area_struct *vma, 1183 1118 void *cpu_addr, dma_addr_t dma_addr, size_t size); 1184 1119 void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi); 1185 1120 void (*pqe_write)(struct hl_device *hdev, __le64 *pqe, ··· 1275 1210 void (*reset_sob_group)(struct hl_device *hdev, u16 sob_group); 1276 1211 void (*set_dma_mask_from_fw)(struct hl_device *hdev); 1277 1212 u64 (*get_device_time)(struct hl_device *hdev); 1278 - void (*collective_wait_init_cs)(struct hl_cs *cs); 1213 + int (*collective_wait_init_cs)(struct hl_cs *cs); 
1279 1214 int (*collective_wait_create_jobs)(struct hl_device *hdev, 1280 - struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id, 1281 - u32 collective_engine_id); 1215 + struct hl_ctx *ctx, struct hl_cs *cs, 1216 + u32 wait_queue_id, u32 collective_engine_id, 1217 + u32 encaps_signal_offset); 1282 1218 u64 (*scramble_addr)(struct hl_device *hdev, u64 addr); 1283 1219 u64 (*descramble_addr)(struct hl_device *hdev, u64 addr); 1284 1220 void (*ack_protection_bits_errors)(struct hl_device *hdev); ··· 1292 1226 int (*map_pll_idx_to_fw_idx)(u32 pll_idx); 1293 1227 void (*init_firmware_loader)(struct hl_device *hdev); 1294 1228 void (*init_cpu_scrambler_dram)(struct hl_device *hdev); 1229 + void (*state_dump_init)(struct hl_device *hdev); 1230 + u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id); 1231 + void (*set_pci_memory_regions)(struct hl_device *hdev); 1232 + u32* (*get_stream_master_qid_arr)(void); 1295 1233 }; 1296 1234 1297 1235 ··· 1353 1283 }; 1354 1284 1355 1285 /** 1356 - * struct hl_pending_cb - pending command buffer structure 1357 - * @cb_node: cb node in pending cb list 1358 - * @cb: command buffer to send in next submission 1359 - * @cb_size: command buffer size 1360 - * @hw_queue_id: destination queue id 1361 - */ 1362 - struct hl_pending_cb { 1363 - struct list_head cb_node; 1364 - struct hl_cb *cb; 1365 - u32 cb_size; 1366 - u32 hw_queue_id; 1367 - }; 1368 - 1369 - /** 1370 1286 * struct hl_ctx - user/kernel context. 1371 1287 * @mem_hash: holds mapping from virtual address to virtual memory area 1372 1288 * descriptor (hl_vm_phys_pg_list or hl_userptr). ··· 1368 1312 * MMU hash or walking the PGT requires talking this lock. 1369 1313 * @hw_block_list_lock: protects the HW block memory list. 1370 1314 * @debugfs_list: node in debugfs list of contexts. 1371 - * pending_cb_list: list of pending command buffers waiting to be sent upon 1372 - * next user command submission context. 
1373 1315 * @hw_block_mem_list: list of HW block virtual mapped addresses. 1374 1316 * @cs_counters: context command submission counters. 1375 1317 * @cb_va_pool: device VA pool for command buffers which are mapped to the 1376 1318 * device's MMU. 1319 + * @sig_mgr: encaps signals handle manager. 1377 1320 * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed 1378 1321 * to user so user could inquire about CS. It is used as 1379 1322 * index to cs_pending array. 1380 1323 * @dram_default_hops: array that holds all hops addresses needed for default 1381 1324 * DRAM mapping. 1382 - * @pending_cb_lock: spinlock to protect pending cb list 1383 1325 * @cs_lock: spinlock to protect cs_sequence. 1384 1326 * @dram_phys_mem: amount of used physical DRAM memory by this context. 1385 1327 * @thread_ctx_switch_token: token to prevent multiple threads of the same 1386 1328 * context from running the context switch phase. 1387 1329 * Only a single thread should run it. 1388 - * @thread_pending_cb_token: token to prevent multiple threads from processing 1389 - * the pending CB list. Only a single thread should 1390 - * process the list since it is protected by a 1391 - * spinlock and we don't want to halt the entire 1392 - * command submission sequence. 
1393 1330 * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run 1394 1331 * the context switch phase from moving to their 1395 1332 * execution phase before the context switch phase ··· 1402 1353 struct mutex mmu_lock; 1403 1354 struct mutex hw_block_list_lock; 1404 1355 struct list_head debugfs_list; 1405 - struct list_head pending_cb_list; 1406 1356 struct list_head hw_block_mem_list; 1407 1357 struct hl_cs_counters_atomic cs_counters; 1408 1358 struct gen_pool *cb_va_pool; 1359 + struct hl_encaps_signals_mgr sig_mgr; 1409 1360 u64 cs_sequence; 1410 1361 u64 *dram_default_hops; 1411 - spinlock_t pending_cb_lock; 1412 1362 spinlock_t cs_lock; 1413 1363 atomic64_t dram_phys_mem; 1414 1364 atomic_t thread_ctx_switch_token; 1415 - atomic_t thread_pending_cb_token; 1416 1365 u32 thread_ctx_switch_wait_token; 1417 1366 u32 asid; 1418 1367 u32 handle; ··· 1441 1394 * @sgt: pointer to the scatter-gather table that holds the pages. 1442 1395 * @dir: for DMA unmapping, the direction must be supplied, so save it. 1443 1396 * @debugfs_list: node in debugfs list of command submissions. 1397 + * @pid: the pid of the user process owning the memory 1444 1398 * @addr: user-space virtual address of the start of the memory area. 1445 1399 * @size: size of the memory area to pin & map. 1446 1400 * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise. 1447 1401 */ 1448 1402 struct hl_userptr { 1449 - enum vm_type_t vm_type; /* must be first */ 1403 + enum vm_type vm_type; /* must be first */ 1450 1404 struct list_head job_node; 1451 1405 struct page **pages; 1452 1406 unsigned int npages; 1453 1407 struct sg_table *sgt; 1454 1408 enum dma_data_direction dir; 1455 1409 struct list_head debugfs_list; 1410 + pid_t pid; 1456 1411 u64 addr; 1457 - u32 size; 1412 + u64 size; 1458 1413 u8 dma_mapped; 1459 1414 }; 1460 1415 ··· 1475 1426 * @mirror_node : node in device mirror list of command submissions. 
1476 1427 * @staged_cs_node: node in the staged cs list. 1477 1428 * @debugfs_list: node in debugfs list of command submissions. 1429 + * @encaps_sig_hdl: holds the encaps signals handle. 1478 1430 * @sequence: the sequence number of this CS. 1479 1431 * @staged_sequence: the sequence of the staged submission this CS is part of, 1480 1432 * relevant only if staged_cs is set. 1481 1433 * @timeout_jiffies: cs timeout in jiffies. 1482 1434 * @submission_time_jiffies: submission time of the cs 1483 1435 * @type: CS_TYPE_*. 1436 + * @encaps_sig_hdl_id: encaps signals handle id, set for the first staged cs. 1484 1437 * @submitted: true if CS was submitted to H/W. 1485 1438 * @completed: true if CS was completed by device. 1486 1439 * @timedout : true if CS was timedout. ··· 1496 1445 * @staged_cs: true if this CS is part of a staged submission. 1497 1446 * @skip_reset_on_timeout: true if we shall not reset the device in case 1498 1447 * timeout occurs (debug scenario). 1448 + * @encaps_signals: true if this CS has encaps reserved signals. 1499 1449 */ 1500 1450 struct hl_cs { 1501 1451 u16 *jobs_in_queue_cnt; ··· 1511 1459 struct list_head mirror_node; 1512 1460 struct list_head staged_cs_node; 1513 1461 struct list_head debugfs_list; 1462 + struct hl_cs_encaps_sig_handle *encaps_sig_hdl; 1514 1463 u64 sequence; 1515 1464 u64 staged_sequence; 1516 1465 u64 timeout_jiffies; 1517 1466 u64 submission_time_jiffies; 1518 1467 enum hl_cs_type type; 1468 + u32 encaps_sig_hdl_id; 1519 1469 u8 submitted; 1520 1470 u8 completed; 1521 1471 u8 timedout; ··· 1528 1474 u8 staged_first; 1529 1475 u8 staged_cs; 1530 1476 u8 skip_reset_on_timeout; 1477 + u8 encaps_signals; 1531 1478 }; 1532 1479 1533 1480 /** ··· 1548 1493 * @hw_queue_id: the id of the H/W queue this job is submitted to. 1549 1494 * @user_cb_size: the actual size of the CB we got from the user. 1550 1495 * @job_cb_size: the actual size of the CB that we put on the queue. 
1496 + * @encaps_sig_wait_offset: encapsulated signals offset, which allow user 1497 + * to wait on part of the reserved signals. 1551 1498 * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a 1552 1499 * handle to a kernel-allocated CB object, false 1553 1500 * otherwise (SRAM/DRAM/host address). ··· 1574 1517 u32 hw_queue_id; 1575 1518 u32 user_cb_size; 1576 1519 u32 job_cb_size; 1520 + u32 encaps_sig_wait_offset; 1577 1521 u8 is_kernel_allocated_cb; 1578 1522 u8 contains_dma_pkt; 1579 1523 }; ··· 1671 1613 * @created_from_userptr: is product of host virtual address. 1672 1614 */ 1673 1615 struct hl_vm_phys_pg_pack { 1674 - enum vm_type_t vm_type; /* must be first */ 1616 + enum vm_type vm_type; /* must be first */ 1675 1617 u64 *pages; 1676 1618 u64 npages; 1677 1619 u64 total_size; ··· 1817 1759 * @ctx_mem_hash_list: list of available contexts with MMU mappings. 1818 1760 * @ctx_mem_hash_spinlock: protects cb_list. 1819 1761 * @blob_desc: descriptor of blob 1762 + * @state_dump: data of the system states in case of a bad cs. 1763 + * @state_dump_sem: protects state_dump. 1820 1764 * @addr: next address to read/write from/to in read/write32. 1821 1765 * @mmu_addr: next virtual address to translate to physical address in mmu_show. 1766 + * @userptr_lookup: the target user ptr to look up for on demand. 1822 1767 * @mmu_asid: ASID to use while translating in mmu_show. 1768 + * @state_dump_head: index of the latest state dump 1823 1769 * @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read. 1824 1770 * @i2c_addr: generic u8 debugfs file for address value to use in i2c_data_read. 1825 1771 * @i2c_reg: generic u8 debugfs file for register value to use in i2c_data_read. 
··· 1845 1783 struct list_head ctx_mem_hash_list; 1846 1784 spinlock_t ctx_mem_hash_spinlock; 1847 1785 struct debugfs_blob_wrapper blob_desc; 1786 + char *state_dump[HL_STATE_DUMP_HIST_LEN]; 1787 + struct rw_semaphore state_dump_sem; 1848 1788 u64 addr; 1849 1789 u64 mmu_addr; 1790 + u64 userptr_lookup; 1850 1791 u32 mmu_asid; 1792 + u32 state_dump_head; 1851 1793 u8 i2c_bus; 1852 1794 u8 i2c_addr; 1853 1795 u8 i2c_reg; 1796 + }; 1797 + 1798 + /** 1799 + * struct hl_hw_obj_name_entry - single hw object name, member of 1800 + * hl_state_dump_specs 1801 + * @node: link to the containing hash table 1802 + * @name: hw object name 1803 + * @id: object identifier 1804 + */ 1805 + struct hl_hw_obj_name_entry { 1806 + struct hlist_node node; 1807 + const char *name; 1808 + u32 id; 1809 + }; 1810 + 1811 + enum hl_state_dump_specs_props { 1812 + SP_SYNC_OBJ_BASE_ADDR, 1813 + SP_NEXT_SYNC_OBJ_ADDR, 1814 + SP_SYNC_OBJ_AMOUNT, 1815 + SP_MON_OBJ_WR_ADDR_LOW, 1816 + SP_MON_OBJ_WR_ADDR_HIGH, 1817 + SP_MON_OBJ_WR_DATA, 1818 + SP_MON_OBJ_ARM_DATA, 1819 + SP_MON_OBJ_STATUS, 1820 + SP_MONITORS_AMOUNT, 1821 + SP_TPC0_CMDQ, 1822 + SP_TPC0_CFG_SO, 1823 + SP_NEXT_TPC, 1824 + SP_MME_CMDQ, 1825 + SP_MME_CFG_SO, 1826 + SP_NEXT_MME, 1827 + SP_DMA_CMDQ, 1828 + SP_DMA_CFG_SO, 1829 + SP_DMA_QUEUES_OFFSET, 1830 + SP_NUM_OF_MME_ENGINES, 1831 + SP_SUB_MME_ENG_NUM, 1832 + SP_NUM_OF_DMA_ENGINES, 1833 + SP_NUM_OF_TPC_ENGINES, 1834 + SP_ENGINE_NUM_OF_QUEUES, 1835 + SP_ENGINE_NUM_OF_STREAMS, 1836 + SP_ENGINE_NUM_OF_FENCES, 1837 + SP_FENCE0_CNT_OFFSET, 1838 + SP_FENCE0_RDATA_OFFSET, 1839 + SP_CP_STS_OFFSET, 1840 + SP_NUM_CORES, 1841 + 1842 + SP_MAX 1843 + }; 1844 + 1845 + enum hl_sync_engine_type { 1846 + ENGINE_TPC, 1847 + ENGINE_DMA, 1848 + ENGINE_MME, 1849 + }; 1850 + 1851 + /** 1852 + * struct hl_mon_state_dump - represents a state dump of a single monitor 1853 + * @id: monitor id 1854 + * @wr_addr_low: address monitor will write to, low bits 1855 + * @wr_addr_high: address monitor will write to, 
high bits 1856 + * @wr_data: data monitor will write 1857 + * @arm_data: register value containing monitor configuration 1858 + * @status: monitor status 1859 + */ 1860 + struct hl_mon_state_dump { 1861 + u32 id; 1862 + u32 wr_addr_low; 1863 + u32 wr_addr_high; 1864 + u32 wr_data; 1865 + u32 arm_data; 1866 + u32 status; 1867 + }; 1868 + 1869 + /** 1870 + * struct hl_sync_to_engine_map_entry - sync object id to engine mapping entry 1871 + * @engine_type: type of the engine 1872 + * @engine_id: id of the engine 1873 + * @sync_id: id of the sync object 1874 + */ 1875 + struct hl_sync_to_engine_map_entry { 1876 + struct hlist_node node; 1877 + enum hl_sync_engine_type engine_type; 1878 + u32 engine_id; 1879 + u32 sync_id; 1880 + }; 1881 + 1882 + /** 1883 + * struct hl_sync_to_engine_map - maps sync object id to associated engine id 1884 + * @tb: hash table containing the mapping, each element is of type 1885 + * struct hl_sync_to_engine_map_entry 1886 + */ 1887 + struct hl_sync_to_engine_map { 1888 + DECLARE_HASHTABLE(tb, SYNC_TO_ENGINE_HASH_TABLE_BITS); 1889 + }; 1890 + 1891 + /** 1892 + * struct hl_state_dump_specs_funcs - virtual functions used by the state dump 1893 + * @gen_sync_to_engine_map: generate a hash map from sync obj id to its engine 1894 + * @print_single_monitor: format monitor data as string 1895 + * @monitor_valid: return true if given monitor dump is valid 1896 + * @print_fences_single_engine: format fences data as string 1897 + */ 1898 + struct hl_state_dump_specs_funcs { 1899 + int (*gen_sync_to_engine_map)(struct hl_device *hdev, 1900 + struct hl_sync_to_engine_map *map); 1901 + int (*print_single_monitor)(char **buf, size_t *size, size_t *offset, 1902 + struct hl_device *hdev, 1903 + struct hl_mon_state_dump *mon); 1904 + int (*monitor_valid)(struct hl_mon_state_dump *mon); 1905 + int (*print_fences_single_engine)(struct hl_device *hdev, 1906 + u64 base_offset, 1907 + u64 status_base_offset, 1908 + enum hl_sync_engine_type engine_type, 1909 + 
u32 engine_id, char **buf, 1910 + size_t *size, size_t *offset); 1911 + }; 1912 + 1913 + /** 1914 + * struct hl_state_dump_specs - defines ASIC known hw objects names 1915 + * @so_id_to_str_tb: sync objects names index table 1916 + * @monitor_id_to_str_tb: monitors names index table 1917 + * @funcs: virtual functions used for state dump 1918 + * @sync_namager_names: readable names for sync manager if available (ex: N_E) 1919 + * @props: pointer to a per asic const props array required for state dump 1920 + */ 1921 + struct hl_state_dump_specs { 1922 + DECLARE_HASHTABLE(so_id_to_str_tb, OBJ_NAMES_HASH_TABLE_BITS); 1923 + DECLARE_HASHTABLE(monitor_id_to_str_tb, OBJ_NAMES_HASH_TABLE_BITS); 1924 + struct hl_state_dump_specs_funcs funcs; 1925 + const char * const *sync_namager_names; 1926 + s64 *props; 1854 1927 }; 1855 1928 1856 1929 ··· 1995 1798 1996 1799 #define HL_STR_MAX 32 1997 1800 1998 - #define HL_DEV_STS_MAX (HL_DEVICE_STATUS_NEEDS_RESET + 1) 1801 + #define HL_DEV_STS_MAX (HL_DEVICE_STATUS_LAST + 1) 1999 1802 2000 1803 /* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe 2001 1804 * x16 cards. In extreme cases, there are hosts that can accommodate 16 cards. ··· 2143 1946 * @wq: work queue for device reset procedure. 2144 1947 * @reset_work: reset work to be done. 2145 1948 * @hdev: habanalabs device structure. 1949 + * @fw_reset: whether f/w will do the reset without us sending them a message to do it. 2146 1950 */ 2147 1951 struct hl_device_reset_work { 2148 1952 struct workqueue_struct *wq; 2149 1953 struct delayed_work reset_work; 2150 1954 struct hl_device *hdev; 1955 + bool fw_reset; 2151 1956 }; 2152 1957 2153 1958 /** ··· 2264 2065 }; 2265 2066 2266 2067 /** 2068 + * number of user contexts allowed to call wait_for_multi_cs ioctl in 2069 + * parallel 2070 + */ 2071 + #define MULTI_CS_MAX_USER_CTX 2 2072 + 2073 + /** 2074 + * struct multi_cs_completion - multi CS wait completion. 
2075 + * @completion: completion of any of the CS in the list 2076 + * @lock: spinlock for the completion structure 2077 + * @timestamp: timestamp for the multi-CS completion 2078 + * @stream_master_qid_map: bitmap of all stream masters on which the multi-CS 2079 + * is waiting 2080 + * @used: 1 if in use, otherwise 0 2081 + */ 2082 + struct multi_cs_completion { 2083 + struct completion completion; 2084 + spinlock_t lock; 2085 + s64 timestamp; 2086 + u32 stream_master_qid_map; 2087 + u8 used; 2088 + }; 2089 + 2090 + /** 2091 + * struct multi_cs_data - internal data for multi CS call 2092 + * @ctx: pointer to the context structure 2093 + * @fence_arr: array of fences of all CSs 2094 + * @seq_arr: array of CS sequence numbers 2095 + * @timeout_us: timeout in usec for waiting for CS to complete 2096 + * @timestamp: timestamp of first completed CS 2097 + * @wait_status: wait for CS status 2098 + * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0) 2099 + * @stream_master_qid_map: bitmap of all stream master QIDs on which the 2100 + * multi-CS is waiting 2101 + * @arr_len: fence_arr and seq_arr array length 2102 + * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0) 2103 + * @update_ts: update timestamp. 1- update the timestamp, otherwise 0. 2104 + */ 2105 + struct multi_cs_data { 2106 + struct hl_ctx *ctx; 2107 + struct hl_fence **fence_arr; 2108 + u64 *seq_arr; 2109 + s64 timeout_us; 2110 + s64 timestamp; 2111 + long wait_status; 2112 + u32 completion_bitmap; 2113 + u32 stream_master_qid_map; 2114 + u8 arr_len; 2115 + u8 gone_cs; 2116 + u8 update_ts; 2117 + }; 2118 + 2119 + /** 2267 2120 * struct hl_device - habanalabs device structure. 2268 2121 * @pdev: pointer to PCI device, can be NULL in case of simulator device. 2269 2122 * @pcie_bar_phys: array of available PCIe bars physical addresses. ··· 2380 2129 * @mmu_func: device-related MMU functions. 2381 2130 * @fw_loader: FW loader manager. 
2382 2131 * @pci_mem_region: array of memory regions in the PCI 2132 + * @state_dump_specs: constants and dictionaries needed to dump system state. 2133 + * @multi_cs_completion: array of multi-CS completion. 2383 2134 * @dram_used_mem: current DRAM memory consumption. 2384 2135 * @timeout_jiffies: device CS timeout value. 2385 2136 * @max_power: the max power of the device, as configured by the sysadmin. This ··· 2458 2205 * halted. We can't halt it again because the COMMS 2459 2206 * protocol will throw an error. Relevant only for 2460 2207 * cases where Linux was not loaded to device CPU 2208 + * @supports_wait_for_multi_cs: true if wait for multi CS is supported 2461 2209 */ 2462 2210 struct hl_device { 2463 2211 struct pci_dev *pdev; ··· 2527 2273 2528 2274 struct pci_mem_region pci_mem_region[PCI_REGION_NUMBER]; 2529 2275 2276 + struct hl_state_dump_specs state_dump_specs; 2277 + 2278 + struct multi_cs_completion multi_cs_completion[ 2279 + MULTI_CS_MAX_USER_CTX]; 2280 + u32 *stream_master_qid_arr; 2530 2281 atomic64_t dram_used_mem; 2531 2282 u64 timeout_jiffies; 2532 2283 u64 max_power; ··· 2581 2322 u8 curr_reset_cause; 2582 2323 u8 skip_reset_on_timeout; 2583 2324 u8 device_cpu_is_halted; 2325 + u8 supports_wait_for_multi_cs; 2326 + u8 stream_master_qid_arr_size; 2584 2327 2585 2328 /* Parameters for bring-up */ 2586 2329 u64 nic_ports_mask; ··· 2603 2342 u8 reset_if_device_not_idle; 2604 2343 }; 2605 2344 2345 + 2346 + /** 2347 + * struct hl_cs_encaps_sig_handle - encapsulated signals handle structure 2348 + * @refcount: refcount used to protect removing this id when several 2349 + * wait cs are used to wait of the reserved encaps signals. 2350 + * @hdev: pointer to habanalabs device structure. 2351 + * @hw_sob: pointer to H/W SOB used in the reservation. 
2352 + * @cs_seq: staged cs sequence which contains encapsulated signals 2353 + * @id: idr handler id to be used to fetch the handler info 2354 + * @q_idx: stream queue index 2355 + * @pre_sob_val: current SOB value before reservation 2356 + * @count: signals number 2357 + */ 2358 + struct hl_cs_encaps_sig_handle { 2359 + struct kref refcount; 2360 + struct hl_device *hdev; 2361 + struct hl_hw_sob *hw_sob; 2362 + u64 cs_seq; 2363 + u32 id; 2364 + u32 q_idx; 2365 + u32 pre_sob_val; 2366 + u32 count; 2367 + }; 2606 2368 2607 2369 /* 2608 2370 * IOCTLs ··· 2655 2371 /* 2656 2372 * Kernel module functions that can be accessed by entire module 2657 2373 */ 2374 + 2375 + /** 2376 + * hl_get_sg_info() - get number of pages and the DMA address from SG list. 2377 + * @sg: the SG list. 2378 + * @dma_addr: pointer to DMA address to return. 2379 + * 2380 + * Calculate the number of consecutive pages described by the SG list. Take the 2381 + * offset of the address in the first page, add to it the length and round it up 2382 + * to the number of needed pages. 2383 + */ 2384 + static inline u32 hl_get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr) 2385 + { 2386 + *dma_addr = sg_dma_address(sg); 2387 + 2388 + return ((((*dma_addr) & (PAGE_SIZE - 1)) + sg_dma_len(sg)) + 2389 + (PAGE_SIZE - 1)) >> PAGE_SHIFT; 2390 + } 2658 2391 2659 2392 /** 2660 2393 * hl_mem_area_inside_range() - Checks whether address+size are inside a range. 
··· 2737 2436 int hl_hw_queues_create(struct hl_device *hdev); 2738 2437 void hl_hw_queues_destroy(struct hl_device *hdev); 2739 2438 int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, 2740 - u32 cb_size, u64 cb_ptr); 2439 + u32 cb_size, u64 cb_ptr); 2440 + void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, 2441 + u32 ctl, u32 len, u64 ptr); 2741 2442 int hl_hw_queue_schedule_cs(struct hl_cs *cs); 2742 2443 u32 hl_hw_queue_add_ptr(u32 ptr, u16 val); 2743 2444 void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id); ··· 2773 2470 void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx); 2774 2471 int hl_ctx_put(struct hl_ctx *ctx); 2775 2472 struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq); 2473 + int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr, 2474 + struct hl_fence **fence, u32 arr_len); 2776 2475 void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr); 2777 2476 void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr); 2778 2477 ··· 2816 2511 void hl_cb_va_pool_fini(struct hl_ctx *ctx); 2817 2512 2818 2513 void hl_cs_rollback_all(struct hl_device *hdev); 2819 - void hl_pending_cb_list_flush(struct hl_ctx *ctx); 2820 2514 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, 2821 2515 enum hl_queue_type queue_type, bool is_kernel_allocated_cb); 2822 2516 void hl_sob_reset_error(struct kref *ref); 2823 2517 int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask); 2824 2518 void hl_fence_put(struct hl_fence *fence); 2519 + void hl_fences_put(struct hl_fence **fence, int len); 2825 2520 void hl_fence_get(struct hl_fence *fence); 2826 2521 void cs_get(struct hl_cs *cs); 2827 2522 bool cs_needs_completion(struct hl_cs *cs); 2828 2523 bool cs_needs_timeout(struct hl_cs *cs); 2829 2524 bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs); 2830 2525 struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq); 2526 + void 
hl_multi_cs_completion_init(struct hl_device *hdev); 2831 2527 2832 2528 void goya_set_asic_funcs(struct hl_device *hdev); 2833 2529 void gaudi_set_asic_funcs(struct hl_device *hdev); ··· 2956 2650 int sensor_index, u32 attr, long value); 2957 2651 int hl_set_current(struct hl_device *hdev, 2958 2652 int sensor_index, u32 attr, long value); 2653 + void hw_sob_get(struct hl_hw_sob *hw_sob); 2654 + void hw_sob_put(struct hl_hw_sob *hw_sob); 2655 + void hl_encaps_handle_do_release(struct kref *ref); 2656 + void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev, 2657 + struct hl_cs *cs, struct hl_cs_job *job, 2658 + struct hl_cs_compl *cs_cmpl); 2959 2659 void hl_release_pending_user_interrupts(struct hl_device *hdev); 2960 2660 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx, 2961 - struct hl_hw_sob **hw_sob, u32 count); 2661 + struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig); 2662 + 2663 + int hl_state_dump(struct hl_device *hdev); 2664 + const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id); 2665 + const char *hl_state_dump_get_monitor_name(struct hl_device *hdev, 2666 + struct hl_mon_state_dump *mon); 2667 + void hl_state_dump_free_sync_to_engine_map(struct hl_sync_to_engine_map *map); 2668 + __printf(4, 5) int hl_snprintf_resize(char **buf, size_t *size, size_t *offset, 2669 + const char *format, ...); 2670 + char *hl_format_as_binary(char *buf, size_t buf_len, u32 n); 2671 + const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type); 2962 2672 2963 2673 #ifdef CONFIG_DEBUG_FS 2964 2674 ··· 2995 2673 struct hl_userptr *userptr); 2996 2674 void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx); 2997 2675 void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx); 2676 + void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data, 2677 + unsigned long length); 2998 2678 2999 2679 #else 3000 2680 ··· 3067 2743 3068 2744 static inline void 
hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, 3069 2745 struct hl_ctx *ctx) 2746 + { 2747 + } 2748 + 2749 + static inline void hl_debugfs_set_state_dump(struct hl_device *hdev, 2750 + char *data, unsigned long length) 3070 2751 { 3071 2752 } 3072 2753

+8 -5

drivers/misc/habanalabs/common/habanalabs_drv.c

··· 141 141 hl_cb_mgr_init(&hpriv->cb_mgr); 142 142 hl_ctx_mgr_init(&hpriv->ctx_mgr); 143 143 144 - hpriv->taskpid = find_get_pid(current->pid); 144 + hpriv->taskpid = get_task_pid(current, PIDTYPE_PID); 145 145 146 146 mutex_lock(&hdev->fpriv_list_lock); 147 147 ··· 194 194 195 195 out_err: 196 196 mutex_unlock(&hdev->fpriv_list_lock); 197 - 198 197 hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr); 199 198 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); 200 199 filp->private_data = NULL; ··· 317 318 hdev->asic_prop.fw_security_enabled = false; 318 319 319 320 /* Assign status description string */ 320 - strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], 321 - "disabled", HL_STR_MAX); 321 + strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], 322 + "operational", HL_STR_MAX); 322 323 strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], 323 324 "in reset", HL_STR_MAX); 325 + strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], 326 + "disabled", HL_STR_MAX); 324 327 strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], 325 328 "needs reset", HL_STR_MAX); 329 + strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION], 330 + "in device creation", HL_STR_MAX); 326 331 327 332 hdev->major = hl_major; 328 333 hdev->reset_on_lockup = reset_on_lockup; ··· 535 532 result = PCI_ERS_RESULT_NONE; 536 533 } 537 534 538 - hdev->asic_funcs->halt_engines(hdev, true); 535 + hdev->asic_funcs->halt_engines(hdev, true, false); 539 536 540 537 return result; 541 538 }

+2

drivers/misc/habanalabs/common/habanalabs_ioctl.c

··· 94 94 95 95 hw_ip.first_available_interrupt_id = 96 96 prop->first_available_user_msix_interrupt; 97 + hw_ip.server_type = prop->server_type; 98 + 97 99 return copy_to_user(out, &hw_ip, 98 100 min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0; 99 101 }

+155 -43

drivers/misc/habanalabs/common/hw_queue.c

··· 65 65 } 66 66 67 67 /* 68 - * ext_and_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a 68 + * hl_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a 69 69 * H/W queue. 70 70 * @hdev: pointer to habanalabs device structure 71 71 * @q: pointer to habanalabs queue structure ··· 80 80 * This function must be called when the scheduler mutex is taken 81 81 * 82 82 */ 83 - static void ext_and_hw_queue_submit_bd(struct hl_device *hdev, 84 - struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr) 83 + void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, 84 + u32 ctl, u32 len, u64 ptr) 85 85 { 86 86 struct hl_bd *bd; 87 87 ··· 222 222 * @cb_size: size of CB 223 223 * @cb_ptr: pointer to CB location 224 224 * 225 - * This function sends a single CB, that must NOT generate a completion entry 226 - * 225 + * This function sends a single CB, that must NOT generate a completion entry. 226 + * Sending CPU messages can be done instead via 'hl_hw_queue_submit_bd()' 227 227 */ 228 228 int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, 229 229 u32 cb_size, u64 cb_ptr) ··· 231 231 struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id]; 232 232 int rc = 0; 233 233 234 - /* 235 - * The CPU queue is a synchronous queue with an effective depth of 236 - * a single entry (although it is allocated with room for multiple 237 - * entries). Therefore, there is a different lock, called 238 - * send_cpu_message_lock, that serializes accesses to the CPU queue. 
239 - * As a result, we don't need to lock the access to the entire H/W 240 - * queues module when submitting a JOB to the CPU queue 241 - */ 242 - if (q->queue_type != QUEUE_TYPE_CPU) 243 - hdev->asic_funcs->hw_queues_lock(hdev); 234 + hdev->asic_funcs->hw_queues_lock(hdev); 244 235 245 236 if (hdev->disabled) { 246 237 rc = -EPERM; ··· 249 258 goto out; 250 259 } 251 260 252 - ext_and_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr); 261 + hl_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr); 253 262 254 263 out: 255 - if (q->queue_type != QUEUE_TYPE_CPU) 256 - hdev->asic_funcs->hw_queues_unlock(hdev); 264 + hdev->asic_funcs->hw_queues_unlock(hdev); 257 265 258 266 return rc; 259 267 } ··· 318 328 cq->pi = hl_cq_inc_ptr(cq->pi); 319 329 320 330 submit_bd: 321 - ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr); 331 + hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr); 322 332 } 323 333 324 334 /* ··· 397 407 else 398 408 ptr = (u64) (uintptr_t) job->user_cb; 399 409 400 - ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr); 410 + hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr); 401 411 } 402 412 403 413 static int init_signal_cs(struct hl_device *hdev, ··· 416 426 cs_cmpl->sob_val = prop->next_sob_val; 417 427 418 428 dev_dbg(hdev->dev, 419 - "generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d\n", 420 - cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx); 429 + "generate signal CB, sob_id: %d, sob val: %u, q_idx: %d, seq: %llu\n", 430 + cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx, 431 + cs_cmpl->cs_seq); 421 432 422 433 /* we set an EB since we must make sure all operations are done 423 434 * when sending the signal ··· 426 435 hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb, 427 436 cs_cmpl->hw_sob->sob_id, 0, true); 428 437 429 - rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1); 438 + rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1, 439 + false); 430 440 431 441 return rc; 432 442 } 433 443 434 - static void 
init_wait_cs(struct hl_device *hdev, struct hl_cs *cs, 444 + void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev, 445 + struct hl_cs *cs, struct hl_cs_job *job, 446 + struct hl_cs_compl *cs_cmpl) 447 + { 448 + struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl; 449 + 450 + cs_cmpl->hw_sob = handle->hw_sob; 451 + 452 + /* Note that encaps_sig_wait_offset was validated earlier in the flow 453 + * for offset value which exceeds the max reserved signal count. 454 + * Always decrement 1 from the offset, since when the user 455 + * sets offset 1, for example, they mean to wait for the first 456 + * signal only, which will be pre_sob_val, and if they set offset 2 457 + * then the value required is (pre_sob_val + 1) and so on... 458 + */ 459 + cs_cmpl->sob_val = handle->pre_sob_val + 460 + (job->encaps_sig_wait_offset - 1); 461 + } 462 + 463 + static int init_wait_cs(struct hl_device *hdev, struct hl_cs *cs, 435 464 struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl) 436 465 { 437 - struct hl_cs_compl *signal_cs_cmpl; 438 - struct hl_sync_stream_properties *prop; 439 466 struct hl_gen_wait_properties wait_prop; 467 + struct hl_sync_stream_properties *prop; 468 + struct hl_cs_compl *signal_cs_cmpl; 440 469 u32 q_idx; 441 470 442 471 q_idx = job->hw_queue_id; ··· 466 455 struct hl_cs_compl, 467 456 base_fence); 468 457 469 - /* copy the SOB id and value of the signal CS */ 470 - cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob; 471 - cs_cmpl->sob_val = signal_cs_cmpl->sob_val; 458 + if (cs->encaps_signals) { 459 + /* use the encaps signal handle stored earlier in the flow 460 + * and set the SOB information from the encaps 461 + * signals handle 462 + */ 463 + hl_hw_queue_encaps_sig_set_sob_info(hdev, cs, job, cs_cmpl); 464 + 465 + dev_dbg(hdev->dev, "Wait for encaps signals handle, qidx(%u), CS sequence(%llu), sob val: 0x%x, offset: %u\n", 466 + cs->encaps_sig_hdl->q_idx, 467 + cs->encaps_sig_hdl->cs_seq, 468 + cs_cmpl->sob_val, 469 + job->encaps_sig_wait_offset); 
470 + } else { 471 + /* Copy the SOB id and value of the signal CS */ 472 + cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob; 473 + cs_cmpl->sob_val = signal_cs_cmpl->sob_val; 474 + } 475 + 476 + /* check again if the signal cs already completed. 477 + * if yes then don't send any wait cs since the hw_sob 478 + * could be in reset already. if signal is not completed 479 + * then get refcount to hw_sob to prevent resetting the sob 480 + * while wait cs is not submitted. 481 + * note that this check is protected by two locks, 482 + * hw queue lock and completion object lock, 483 + * and the same completion object lock also protects 484 + * the hw_sob reset handler function. 485 + * The hw_queue lock prevent out of sync of hw_sob 486 + * refcount value, changed by signal/wait flows. 487 + */ 488 + spin_lock(&signal_cs_cmpl->lock); 489 + 490 + if (completion_done(&cs->signal_fence->completion)) { 491 + spin_unlock(&signal_cs_cmpl->lock); 492 + return -EINVAL; 493 + } 494 + 495 + kref_get(&cs_cmpl->hw_sob->kref); 496 + 497 + spin_unlock(&signal_cs_cmpl->lock); 472 498 473 499 dev_dbg(hdev->dev, 474 - "generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d\n", 500 + "generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d, seq: %llu\n", 475 501 cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, 476 - prop->base_mon_id, q_idx); 502 + prop->base_mon_id, q_idx, cs->sequence); 477 503 478 504 wait_prop.data = (void *) job->patched_cb; 479 505 wait_prop.sob_base = cs_cmpl->hw_sob->sob_id; ··· 519 471 wait_prop.mon_id = prop->base_mon_id; 520 472 wait_prop.q_idx = q_idx; 521 473 wait_prop.size = 0; 474 + 522 475 hdev->asic_funcs->gen_wait_cb(hdev, &wait_prop); 523 476 524 - kref_get(&cs_cmpl->hw_sob->kref); 525 - /* 526 - * Must put the signal fence after the SOB refcnt increment so 527 - * the SOB refcnt won't turn 0 and reset the SOB before the 528 - * wait CS was submitted. 
529 - */ 530 477 mb(); 531 478 hl_fence_put(cs->signal_fence); 532 479 cs->signal_fence = NULL; 480 + 481 + return 0; 533 482 } 534 483 535 484 /* ··· 551 506 if (cs->type & CS_TYPE_SIGNAL) 552 507 rc = init_signal_cs(hdev, job, cs_cmpl); 553 508 else if (cs->type & CS_TYPE_WAIT) 554 - init_wait_cs(hdev, cs, job, cs_cmpl); 509 + rc = init_wait_cs(hdev, cs, job, cs_cmpl); 510 + 511 + return rc; 512 + } 513 + 514 + static int encaps_sig_first_staged_cs_handler 515 + (struct hl_device *hdev, struct hl_cs *cs) 516 + { 517 + struct hl_cs_compl *cs_cmpl = 518 + container_of(cs->fence, 519 + struct hl_cs_compl, base_fence); 520 + struct hl_cs_encaps_sig_handle *encaps_sig_hdl; 521 + struct hl_encaps_signals_mgr *mgr; 522 + int rc = 0; 523 + 524 + mgr = &hdev->compute_ctx->sig_mgr; 525 + 526 + spin_lock(&mgr->lock); 527 + encaps_sig_hdl = idr_find(&mgr->handles, cs->encaps_sig_hdl_id); 528 + if (encaps_sig_hdl) { 529 + /* 530 + * Set handler CS sequence, 531 + * the CS which contains the encapsulated signals. 532 + */ 533 + encaps_sig_hdl->cs_seq = cs->sequence; 534 + /* store the handle and set encaps signal indication, 535 + * to be used later in cs_do_release to put the last 536 + * reference to encaps signals handlers. 
537 + */ 538 + cs_cmpl->encaps_signals = true; 539 + cs_cmpl->encaps_sig_hdl = encaps_sig_hdl; 540 + 541 + /* set hw_sob pointer in completion object 542 + * since it's used in cs_do_release flow to put 543 + * refcount to sob 544 + */ 545 + cs_cmpl->hw_sob = encaps_sig_hdl->hw_sob; 546 + cs_cmpl->sob_val = encaps_sig_hdl->pre_sob_val + 547 + encaps_sig_hdl->count; 548 + 549 + dev_dbg(hdev->dev, "CS seq (%llu) added to encaps signal handler id (%u), count(%u), qidx(%u), sob(%u), val(%u)\n", 550 + cs->sequence, encaps_sig_hdl->id, 551 + encaps_sig_hdl->count, 552 + encaps_sig_hdl->q_idx, 553 + cs_cmpl->hw_sob->sob_id, 554 + cs_cmpl->sob_val); 555 + 556 + } else { 557 + dev_err(hdev->dev, "encaps handle id(%u) wasn't found!\n", 558 + cs->encaps_sig_hdl_id); 559 + rc = -EINVAL; 560 + } 561 + 562 + spin_unlock(&mgr->lock); 555 563 556 564 return rc; 557 565 } ··· 679 581 680 582 if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT)) { 681 583 rc = init_signal_wait_cs(cs); 682 - if (rc) { 683 - dev_err(hdev->dev, "Failed to submit signal cs\n"); 584 + if (rc) 684 585 goto unroll_cq_resv; 685 - } 686 - } else if (cs->type == CS_TYPE_COLLECTIVE_WAIT) 687 - hdev->asic_funcs->collective_wait_init_cs(cs); 586 + } else if (cs->type == CS_TYPE_COLLECTIVE_WAIT) { 587 + rc = hdev->asic_funcs->collective_wait_init_cs(cs); 588 + if (rc) 589 + goto unroll_cq_resv; 590 + } 688 591 592 + 593 + if (cs->encaps_signals && cs->staged_first) { 594 + rc = encaps_sig_first_staged_cs_handler(hdev, cs); 595 + if (rc) 596 + goto unroll_cq_resv; 597 + } 689 598 690 599 spin_lock(&hdev->cs_mirror_lock); 691 600 ··· 718 613 } 719 614 720 615 list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node); 616 + 617 + /* update stream map of the first CS */ 618 + if (hdev->supports_wait_for_multi_cs) 619 + staged_cs->fence->stream_master_qid_map |= 620 + cs->fence->stream_master_qid_map; 721 621 } 722 622 723 623 list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list); ··· 944 834 hw_sob = 
&sync_stream_prop->hw_sob[sob]; 945 835 hw_sob->hdev = hdev; 946 836 hw_sob->sob_id = sync_stream_prop->base_sob_id + sob; 837 + hw_sob->sob_addr = 838 + hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id); 947 839 hw_sob->q_idx = q_idx; 948 840 kref_init(&hw_sob->kref); 949 841 }

+124 -47

drivers/misc/habanalabs/common/memory.c

··· 124 124 125 125 spin_lock(&vm->idr_lock); 126 126 handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0, 127 - GFP_KERNEL); 127 + GFP_ATOMIC); 128 128 spin_unlock(&vm->idr_lock); 129 129 130 130 if (handle < 0) { ··· 529 529 } 530 530 531 531 /** 532 + * is_hint_crossing_range() - check if hint address crossing specified reserved 533 + * range. 534 + */ 535 + static inline bool is_hint_crossing_range(enum hl_va_range_type range_type, 536 + u64 start_addr, u32 size, struct asic_fixed_properties *prop) { 537 + bool range_cross; 538 + 539 + if (range_type == HL_VA_RANGE_TYPE_DRAM) 540 + range_cross = 541 + hl_mem_area_crosses_range(start_addr, size, 542 + prop->hints_dram_reserved_va_range.start_addr, 543 + prop->hints_dram_reserved_va_range.end_addr); 544 + else if (range_type == HL_VA_RANGE_TYPE_HOST) 545 + range_cross = 546 + hl_mem_area_crosses_range(start_addr, size, 547 + prop->hints_host_reserved_va_range.start_addr, 548 + prop->hints_host_reserved_va_range.end_addr); 549 + else 550 + range_cross = 551 + hl_mem_area_crosses_range(start_addr, size, 552 + prop->hints_host_hpage_reserved_va_range.start_addr, 553 + prop->hints_host_hpage_reserved_va_range.end_addr); 554 + 555 + return range_cross; 556 + } 557 + 558 + /** 532 559 * get_va_block() - get a virtual block for the given size and alignment. 533 560 * 534 561 * @hdev: pointer to the habanalabs device structure. ··· 563 536 * @size: requested block size. 564 537 * @hint_addr: hint for requested address by the user. 565 538 * @va_block_align: required alignment of the virtual block start address. 
539 + * @range_type: va range type (host, dram) 540 + * @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT 566 541 * 567 542 * This function does the following: 568 543 * - Iterate on the virtual block list to find a suitable virtual block for the ··· 574 545 */ 575 546 static u64 get_va_block(struct hl_device *hdev, 576 547 struct hl_va_range *va_range, 577 - u64 size, u64 hint_addr, u32 va_block_align) 548 + u64 size, u64 hint_addr, u32 va_block_align, 549 + enum hl_va_range_type range_type, 550 + u32 flags) 578 551 { 579 552 struct hl_vm_va_block *va_block, *new_va_block = NULL; 553 + struct asic_fixed_properties *prop = &hdev->asic_prop; 580 554 u64 tmp_hint_addr, valid_start, valid_size, prev_start, prev_end, 581 - align_mask, reserved_valid_start = 0, reserved_valid_size = 0; 555 + align_mask, reserved_valid_start = 0, reserved_valid_size = 0, 556 + dram_hint_mask = prop->dram_hints_align_mask; 582 557 bool add_prev = false; 583 558 bool is_align_pow_2 = is_power_of_2(va_range->page_size); 559 + bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr); 560 + bool force_hint = flags & HL_MEM_FORCE_HINT; 584 561 585 562 if (is_align_pow_2) 586 563 align_mask = ~((u64)va_block_align - 1); ··· 599 564 size = DIV_ROUND_UP_ULL(size, va_range->page_size) * 600 565 va_range->page_size; 601 566 602 - tmp_hint_addr = hint_addr; 567 + tmp_hint_addr = hint_addr & ~dram_hint_mask; 603 568 604 569 /* Check if we need to ignore hint address */ 605 570 if ((is_align_pow_2 && (hint_addr & (va_block_align - 1))) || 606 - (!is_align_pow_2 && 607 - do_div(tmp_hint_addr, va_range->page_size))) { 571 + (!is_align_pow_2 && is_hint_dram_addr && 572 + do_div(tmp_hint_addr, va_range->page_size))) { 573 + 574 + if (force_hint) { 575 + /* Hint must be respected, so here we just fail */ 576 + dev_err(hdev->dev, 577 + "Hint address 0x%llx is not page aligned - cannot be respected\n", 578 + hint_addr); 579 + return 0; 580 + } 608 581 609 582 dev_dbg(hdev->dev, 610 583 
"Hint address 0x%llx will be ignored because it is not aligned\n", ··· 639 596 if (valid_size < size) 640 597 continue; 641 598 599 + /* 600 + * In case hint address is 0, and arc_hints_range_reservation 601 + * property enabled, then avoid allocating va blocks from the 602 + * range reserved for hint addresses 603 + */ 604 + if (prop->hints_range_reservation && !hint_addr) 605 + if (is_hint_crossing_range(range_type, valid_start, 606 + size, prop)) 607 + continue; 608 + 642 609 /* Pick the minimal length block which has the required size */ 643 610 if (!new_va_block || (valid_size < reserved_valid_size)) { 644 611 new_va_block = va_block; ··· 668 615 if (!new_va_block) { 669 616 dev_err(hdev->dev, "no available va block for size %llu\n", 670 617 size); 618 + goto out; 619 + } 620 + 621 + if (force_hint && reserved_valid_start != hint_addr) { 622 + /* Hint address must be respected. If we are here - this means 623 + * we could not respect it. 624 + */ 625 + dev_err(hdev->dev, 626 + "Hint address 0x%llx could not be respected\n", 627 + hint_addr); 628 + reserved_valid_start = 0; 671 629 goto out; 672 630 } 673 631 ··· 734 670 enum hl_va_range_type type, u32 size, u32 alignment) 735 671 { 736 672 return get_va_block(hdev, ctx->va_range[type], size, 0, 737 - max(alignment, ctx->va_range[type]->page_size)); 673 + max(alignment, ctx->va_range[type]->page_size), 674 + type, 0); 738 675 } 739 676 740 677 /** ··· 797 732 } 798 733 799 734 /** 800 - * get_sg_info() - get number of pages and the DMA address from SG list. 801 - * @sg: the SG list. 802 - * @dma_addr: pointer to DMA address to return. 803 - * 804 - * Calculate the number of consecutive pages described by the SG list. Take the 805 - * offset of the address in the first page, add to it the length and round it up 806 - * to the number of needed pages. 
807 - */ 808 - static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr) 809 - { 810 - *dma_addr = sg_dma_address(sg); 811 - 812 - return ((((*dma_addr) & (PAGE_SIZE - 1)) + sg_dma_len(sg)) + 813 - (PAGE_SIZE - 1)) >> PAGE_SHIFT; 814 - } 815 - 816 - /** 817 735 * init_phys_pg_pack_from_userptr() - initialize physical page pack from host 818 736 * memory 819 737 * @ctx: pointer to the context structure. 820 738 * @userptr: userptr to initialize from. 821 739 * @pphys_pg_pack: result pointer. 740 + * @force_regular_page: tell the function to ignore huge page optimization, 741 + * even if possible. Needed for cases where the device VA 742 + * is allocated before we know the composition of the 743 + * physical pages 822 744 * 823 745 * This function does the following: 824 746 * - Pin the physical pages related to the given virtual block. ··· 814 762 */ 815 763 static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx, 816 764 struct hl_userptr *userptr, 817 - struct hl_vm_phys_pg_pack **pphys_pg_pack) 765 + struct hl_vm_phys_pg_pack **pphys_pg_pack, 766 + bool force_regular_page) 818 767 { 819 - struct hl_vm_phys_pg_pack *phys_pg_pack; 820 - struct scatterlist *sg; 821 - dma_addr_t dma_addr; 822 - u64 page_mask, total_npages; 823 768 u32 npages, page_size = PAGE_SIZE, 824 769 huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size; 825 - bool first = true, is_huge_page_opt = true; 826 - int rc, i, j; 827 770 u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size); 771 + struct hl_vm_phys_pg_pack *phys_pg_pack; 772 + bool first = true, is_huge_page_opt; 773 + u64 page_mask, total_npages; 774 + struct scatterlist *sg; 775 + dma_addr_t dma_addr; 776 + int rc, i, j; 828 777 829 778 phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL); 830 779 if (!phys_pg_pack) ··· 836 783 phys_pg_pack->asid = ctx->asid; 837 784 atomic_set(&phys_pg_pack->mapping_cnt, 1); 838 785 786 + is_huge_page_opt = (force_regular_page ? 
false : true); 787 + 839 788 /* Only if all dma_addrs are aligned to 2MB and their 840 789 * sizes is at least 2MB, we can use huge page mapping. 841 790 * We limit the 2MB optimization to this condition, ··· 846 791 */ 847 792 total_npages = 0; 848 793 for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) { 849 - npages = get_sg_info(sg, &dma_addr); 794 + npages = hl_get_sg_info(sg, &dma_addr); 850 795 851 796 total_npages += npages; 852 797 ··· 875 820 876 821 j = 0; 877 822 for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) { 878 - npages = get_sg_info(sg, &dma_addr); 823 + npages = hl_get_sg_info(sg, &dma_addr); 879 824 880 825 /* align down to physical page size and save the offset */ 881 826 if (first) { ··· 1056 1001 struct hl_userptr *userptr = NULL; 1057 1002 struct hl_vm_hash_node *hnode; 1058 1003 struct hl_va_range *va_range; 1059 - enum vm_type_t *vm_type; 1004 + enum vm_type *vm_type; 1060 1005 u64 ret_vaddr, hint_addr; 1061 1006 u32 handle = 0, va_block_align; 1062 1007 int rc; 1063 1008 bool is_userptr = args->flags & HL_MEM_USERPTR; 1009 + enum hl_va_range_type va_range_type = 0; 1064 1010 1065 1011 /* Assume failure */ 1066 1012 *device_addr = 0; ··· 1079 1023 } 1080 1024 1081 1025 rc = init_phys_pg_pack_from_userptr(ctx, userptr, 1082 - &phys_pg_pack); 1026 + &phys_pg_pack, false); 1083 1027 if (rc) { 1084 1028 dev_err(hdev->dev, 1085 1029 "unable to init page pack for vaddr 0x%llx\n", ··· 1087 1031 goto init_page_pack_err; 1088 1032 } 1089 1033 1090 - vm_type = (enum vm_type_t *) userptr; 1034 + vm_type = (enum vm_type *) userptr; 1091 1035 hint_addr = args->map_host.hint_addr; 1092 1036 handle = phys_pg_pack->handle; 1093 1037 1094 1038 /* get required alignment */ 1095 1039 if (phys_pg_pack->page_size == page_size) { 1096 1040 va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST]; 1097 - 1041 + va_range_type = HL_VA_RANGE_TYPE_HOST; 1098 1042 /* 1099 1043 * huge page alignment may be needed in case of regular 1100 1044 * page mapping, 
depending on the host VA alignment ··· 1109 1053 * mapping 1110 1054 */ 1111 1055 va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]; 1056 + va_range_type = HL_VA_RANGE_TYPE_HOST_HUGE; 1112 1057 va_block_align = huge_page_size; 1113 1058 } 1114 1059 } else { ··· 1129 1072 1130 1073 spin_unlock(&vm->idr_lock); 1131 1074 1132 - vm_type = (enum vm_type_t *) phys_pg_pack; 1075 + vm_type = (enum vm_type *) phys_pg_pack; 1133 1076 1134 1077 hint_addr = args->map_device.hint_addr; 1135 1078 1136 1079 /* DRAM VA alignment is the same as the MMU page size */ 1137 1080 va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM]; 1081 + va_range_type = HL_VA_RANGE_TYPE_DRAM; 1138 1082 va_block_align = hdev->asic_prop.dmmu.page_size; 1139 1083 } 1140 1084 ··· 1158 1100 goto hnode_err; 1159 1101 } 1160 1102 1103 + if (hint_addr && phys_pg_pack->offset) { 1104 + if (args->flags & HL_MEM_FORCE_HINT) { 1105 + /* Fail if hint must be respected but it can't be */ 1106 + dev_err(hdev->dev, 1107 + "Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n", 1108 + hint_addr, phys_pg_pack->offset); 1109 + rc = -EINVAL; 1110 + goto va_block_err; 1111 + } 1112 + dev_dbg(hdev->dev, 1113 + "Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n", 1114 + hint_addr, phys_pg_pack->offset); 1115 + } 1116 + 1161 1117 ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size, 1162 - hint_addr, va_block_align); 1118 + hint_addr, va_block_align, 1119 + va_range_type, args->flags); 1163 1120 if (!ret_vaddr) { 1164 1121 dev_err(hdev->dev, "no available va block for handle %u\n", 1165 1122 handle); ··· 1254 1181 static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, 1255 1182 bool ctx_free) 1256 1183 { 1257 - struct hl_device *hdev = ctx->hdev; 1258 - struct asic_fixed_properties *prop = &hdev->asic_prop; 1259 1184 struct hl_vm_phys_pg_pack *phys_pg_pack = NULL; 1185 + u64 vaddr = args->unmap.device_virt_addr; 1260 1186 struct 
hl_vm_hash_node *hnode = NULL; 1187 + struct asic_fixed_properties *prop; 1188 + struct hl_device *hdev = ctx->hdev; 1261 1189 struct hl_userptr *userptr = NULL; 1262 1190 struct hl_va_range *va_range; 1263 - u64 vaddr = args->unmap.device_virt_addr; 1264 - enum vm_type_t *vm_type; 1191 + enum vm_type *vm_type; 1265 1192 bool is_userptr; 1266 1193 int rc = 0; 1194 + 1195 + prop = &hdev->asic_prop; 1267 1196 1268 1197 /* protect from double entrance */ 1269 1198 mutex_lock(&ctx->mem_hash_lock); ··· 1289 1214 if (*vm_type == VM_TYPE_USERPTR) { 1290 1215 is_userptr = true; 1291 1216 userptr = hnode->ptr; 1292 - rc = init_phys_pg_pack_from_userptr(ctx, userptr, 1293 - &phys_pg_pack); 1217 + 1218 + rc = init_phys_pg_pack_from_userptr(ctx, userptr, &phys_pg_pack, 1219 + false); 1294 1220 if (rc) { 1295 1221 dev_err(hdev->dev, 1296 1222 "unable to init page pack for vaddr 0x%llx\n", ··· 1375 1299 kfree(hnode); 1376 1300 1377 1301 if (is_userptr) { 1378 - rc = free_phys_pg_pack(hdev, phys_pg_pack); 1302 + free_phys_pg_pack(hdev, phys_pg_pack); 1379 1303 dma_unmap_host_va(hdev, userptr); 1380 1304 } 1381 1305 ··· 1745 1669 return -EINVAL; 1746 1670 } 1747 1671 1672 + userptr->pid = current->pid; 1748 1673 userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_KERNEL); 1749 1674 if (!userptr->sgt) 1750 1675 return -ENOMEM; ··· 2110 2033 * another side effect error 2111 2034 */ 2112 2035 if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash)) 2113 - dev_notice(hdev->dev, 2036 + dev_dbg(hdev->dev, 2114 2037 "user released device without removing its memory mappings\n"); 2115 2038 2116 2039 hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {

+6 -6

drivers/misc/habanalabs/common/mmu/mmu_v1.c

··· 470 470 if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.hr.mmu_shadow_hop0)) { 471 471 kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0); 472 472 gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); 473 - } 474 473 475 - /* Make sure that if we arrive here again without init was called we 476 - * won't cause kernel panic. This can happen for example if we fail 477 - * during hard reset code at certain points 478 - */ 479 - hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL; 474 + /* Make sure that if we arrive here again without init was 475 + * called we won't cause kernel panic. This can happen for 476 + * example if we fail during hard reset code at certain points 477 + */ 478 + hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL; 479 + } 480 480 } 481 481 482 482 /**

+2

drivers/misc/habanalabs/common/pci/pci.c

··· 436 436 goto unmap_pci_bars; 437 437 } 438 438 439 + dma_set_max_seg_size(&pdev->dev, U32_MAX); 440 + 439 441 return 0; 440 442 441 443 unmap_pci_bars:

+718

drivers/misc/habanalabs/common/state_dump.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + /* 4 + * Copyright 2021 HabanaLabs, Ltd. 5 + * All Rights Reserved. 6 + */ 7 + 8 + #include <linux/vmalloc.h> 9 + #include <uapi/misc/habanalabs.h> 10 + #include "habanalabs.h" 11 + 12 + /** 13 + * hl_format_as_binary - helper function, format an integer as binary 14 + * using supplied scratch buffer 15 + * @buf: the buffer to use 16 + * @buf_len: buffer capacity 17 + * @n: number to format 18 + * 19 + * Returns pointer to buffer 20 + */ 21 + char *hl_format_as_binary(char *buf, size_t buf_len, u32 n) 22 + { 23 + int i; 24 + u32 bit; 25 + bool leading0 = true; 26 + char *wrptr = buf; 27 + 28 + if (buf_len > 0 && buf_len < 3) { 29 + *wrptr = '\0'; 30 + return buf; 31 + } 32 + 33 + wrptr[0] = '0'; 34 + wrptr[1] = 'b'; 35 + wrptr += 2; 36 + /* Remove 3 characters from length for '0b' and '\0' termination */ 37 + buf_len -= 3; 38 + 39 + for (i = 0; i < sizeof(n) * BITS_PER_BYTE && buf_len; ++i, n <<= 1) { 40 + /* Writing bit calculation in one line would cause a false 41 + * positive static code analysis error, so splitting. 42 + */ 43 + bit = n & (1 << (sizeof(n) * BITS_PER_BYTE - 1)); 44 + bit = !!bit; 45 + leading0 &= !bit; 46 + if (!leading0) { 47 + *wrptr = '0' + bit; 48 + ++wrptr; 49 + } 50 + } 51 + 52 + *wrptr = '\0'; 53 + 54 + return buf; 55 + } 56 + 57 + /** 58 + * resize_to_fit - helper function, resize buffer to fit given amount of data 59 + * @buf: destination buffer double pointer 60 + * @size: pointer to the size container 61 + * @desired_size: size the buffer must contain 62 + * 63 + * Returns 0 on success or error code on failure. 64 + * On success, the size of buffer is at least desired_size. Buffer is allocated 65 + * via vmalloc and must be freed with vfree. 
66 + */ 67 + static int resize_to_fit(char **buf, size_t *size, size_t desired_size) 68 + { 69 + char *resized_buf; 70 + size_t new_size; 71 + 72 + if (*size >= desired_size) 73 + return 0; 74 + 75 + /* Not enough space to print all, have to resize */ 76 + new_size = max_t(size_t, PAGE_SIZE, round_up(desired_size, PAGE_SIZE)); 77 + resized_buf = vmalloc(new_size); 78 + if (!resized_buf) 79 + return -ENOMEM; 80 + memcpy(resized_buf, *buf, *size); 81 + vfree(*buf); 82 + *buf = resized_buf; 83 + *size = new_size; 84 + 85 + return 1; 86 + } 87 + 88 + /** 89 + * hl_snprintf_resize() - print formatted data to buffer, resize as needed 90 + * @buf: buffer double pointer, to be written to and resized, must be either 91 + * NULL or allocated with vmalloc. 92 + * @size: current size of the buffer 93 + * @offset: current offset to write to 94 + * @format: format of the data 95 + * 96 + * This function will write formatted data into the buffer. If buffer is not 97 + * large enough, it will be resized using vmalloc. Size may be modified if the 98 + * buffer was resized, offset will be advanced by the number of bytes written 99 + * not including the terminating character 100 + * 101 + * Returns 0 on success or error code on failure 102 + * 103 + * Note that the buffer has to be manually released using vfree. 104 + */ 105 + int hl_snprintf_resize(char **buf, size_t *size, size_t *offset, 106 + const char *format, ...) 
107 + { 108 + va_list args; 109 + size_t length; 110 + int rc; 111 + 112 + if (*buf == NULL && (*size != 0 || *offset != 0)) 113 + return -EINVAL; 114 + 115 + va_start(args, format); 116 + length = vsnprintf(*buf + *offset, *size - *offset, format, args); 117 + va_end(args); 118 + 119 + rc = resize_to_fit(buf, size, *offset + length + 1); 120 + if (rc < 0) 121 + return rc; 122 + else if (rc > 0) { 123 + /* Resize was needed, write again */ 124 + va_start(args, format); 125 + length = vsnprintf(*buf + *offset, *size - *offset, format, 126 + args); 127 + va_end(args); 128 + } 129 + 130 + *offset += length; 131 + 132 + return 0; 133 + } 134 + 135 + /** 136 + * hl_sync_engine_to_string - convert engine type enum to string literal 137 + * @engine_type: engine type (TPC/MME/DMA) 138 + * 139 + * Return the resolved string literal 140 + */ 141 + const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type) 142 + { 143 + switch (engine_type) { 144 + case ENGINE_DMA: 145 + return "DMA"; 146 + case ENGINE_MME: 147 + return "MME"; 148 + case ENGINE_TPC: 149 + return "TPC"; 150 + } 151 + return "Invalid Engine Type"; 152 + } 153 + 154 + /** 155 + * hl_print_resize_sync_engine - helper function, format engine name and ID 156 + * using hl_snprintf_resize 157 + * @buf: destination buffer double pointer to be used with hl_snprintf_resize 158 + * @size: pointer to the size container 159 + * @offset: pointer to the offset container 160 + * @engine_type: engine type (TPC/MME/DMA) 161 + * @engine_id: engine numerical id 162 + * 163 + * Returns 0 on success or error code on failure 164 + */ 165 + static int hl_print_resize_sync_engine(char **buf, size_t *size, size_t *offset, 166 + enum hl_sync_engine_type engine_type, 167 + u32 engine_id) 168 + { 169 + return hl_snprintf_resize(buf, size, offset, "%s%u", 170 + hl_sync_engine_to_string(engine_type), engine_id); 171 + } 172 + 173 + /** 174 + * hl_state_dump_get_sync_name - transform sync object id to name if available 175 + 
* @hdev: pointer to the device 176 + * @sync_id: sync object id 177 + * 178 + * Returns a name literal or NULL if not resolved. 179 + * Note: returning NULL shall not be considered as a failure, as not all 180 + * sync objects are named. 181 + */ 182 + const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id) 183 + { 184 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 185 + struct hl_hw_obj_name_entry *entry; 186 + 187 + hash_for_each_possible(sds->so_id_to_str_tb, entry, 188 + node, sync_id) 189 + if (sync_id == entry->id) 190 + return entry->name; 191 + 192 + return NULL; 193 + } 194 + 195 + /** 196 + * hl_state_dump_get_monitor_name - transform monitor object dump to monitor 197 + * name if available 198 + * @hdev: pointer to the device 199 + * @mon: monitor state dump 200 + * 201 + * Returns a name literal or NULL if not resolved. 202 + * Note: returning NULL shall not be considered as a failure, as not all 203 + * monitors are named. 204 + */ 205 + const char *hl_state_dump_get_monitor_name(struct hl_device *hdev, 206 + struct hl_mon_state_dump *mon) 207 + { 208 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 209 + struct hl_hw_obj_name_entry *entry; 210 + 211 + hash_for_each_possible(sds->monitor_id_to_str_tb, 212 + entry, node, mon->id) 213 + if (mon->id == entry->id) 214 + return entry->name; 215 + 216 + return NULL; 217 + } 218 + 219 + /** 220 + * hl_state_dump_free_sync_to_engine_map - free sync object to engine map 221 + * @map: sync object to engine map 222 + * 223 + * Note: generic free implementation, the allocation is implemented per ASIC. 
224 + */ 225 + void hl_state_dump_free_sync_to_engine_map(struct hl_sync_to_engine_map *map) 226 + { 227 + struct hl_sync_to_engine_map_entry *entry; 228 + struct hlist_node *tmp_node; 229 + int i; 230 + 231 + hash_for_each_safe(map->tb, i, tmp_node, entry, node) { 232 + hash_del(&entry->node); 233 + kfree(entry); 234 + } 235 + } 236 + 237 + /** 238 + * hl_state_dump_get_sync_to_engine - transform sync_id to 239 + * hl_sync_to_engine_map_entry if available for current id 240 + * @map: sync object to engine map 241 + * @sync_id: sync object id 242 + * 243 + * Returns the translation entry if found or NULL if not. 244 + * Note, returned NULL shall not be considered as a failure as the map 245 + * does not cover all possible sync ids; it is a best-effort map. 246 + */ 247 + static struct hl_sync_to_engine_map_entry * 248 + hl_state_dump_get_sync_to_engine(struct hl_sync_to_engine_map *map, u32 sync_id) 249 + { 250 + struct hl_sync_to_engine_map_entry *entry; 251 + 252 + hash_for_each_possible(map->tb, entry, node, sync_id) 253 + if (entry->sync_id == sync_id) 254 + return entry; 255 + return NULL; 256 + } 257 + 258 + /** 259 + * hl_state_dump_read_sync_objects - read sync objects array 260 + * @hdev: pointer to the device 261 + * @index: sync manager block index starting with E_N 262 + * 263 + * Returns array of size SP_SYNC_OBJ_AMOUNT on success or NULL on failure 264 + */ 265 + static u32 *hl_state_dump_read_sync_objects(struct hl_device *hdev, u32 index) 266 + { 267 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 268 + u32 *sync_objects; 269 + s64 base_addr; /* Base addr can be negative */ 270 + int i; 271 + 272 + base_addr = sds->props[SP_SYNC_OBJ_BASE_ADDR] + 273 + sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index; 274 + 275 + sync_objects = vmalloc(sds->props[SP_SYNC_OBJ_AMOUNT] * sizeof(u32)); 276 + if (!sync_objects) 277 + return NULL; 278 + 279 + for (i = 0; i < sds->props[SP_SYNC_OBJ_AMOUNT]; ++i) 280 + sync_objects[i] = RREG32(base_addr + i * sizeof(u32)); 
281 + 282 + return sync_objects; 283 + } 284 + 285 + /** 286 + * hl_state_dump_free_sync_objects - free sync objects array allocated by 287 + * hl_state_dump_read_sync_objects 288 + * @sync_objects: sync objects array 289 + */ 290 + static void hl_state_dump_free_sync_objects(u32 *sync_objects) 291 + { 292 + vfree(sync_objects); 293 + } 294 + 295 + 296 + /** 297 + * hl_state_dump_print_syncs_single_block - print active sync objects on a 298 + * single block 299 + * @hdev: pointer to the device 300 + * @index: sync manager block index starting with E_N 301 + * @buf: destination buffer double pointer to be used with hl_snprintf_resize 302 + * @size: pointer to the size container 303 + * @offset: pointer to the offset container 304 + * @map: sync engines names map 305 + * 306 + * Returns 0 on success or error code on failure 307 + */ 308 + static int 309 + hl_state_dump_print_syncs_single_block(struct hl_device *hdev, u32 index, 310 + char **buf, size_t *size, size_t *offset, 311 + struct hl_sync_to_engine_map *map) 312 + { 313 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 314 + const char *sync_name; 315 + u32 *sync_objects = NULL; 316 + int rc = 0, i; 317 + 318 + if (sds->sync_namager_names) { 319 + rc = hl_snprintf_resize( 320 + buf, size, offset, "%s\n", 321 + sds->sync_namager_names[index]); 322 + if (rc) 323 + goto out; 324 + } 325 + 326 + sync_objects = hl_state_dump_read_sync_objects(hdev, index); 327 + if (!sync_objects) { 328 + rc = -ENOMEM; 329 + goto out; 330 + } 331 + 332 + for (i = 0; i < sds->props[SP_SYNC_OBJ_AMOUNT]; ++i) { 333 + struct hl_sync_to_engine_map_entry *entry; 334 + u64 sync_object_addr; 335 + 336 + if (!sync_objects[i]) 337 + continue; 338 + 339 + sync_object_addr = sds->props[SP_SYNC_OBJ_BASE_ADDR] + 340 + sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index + 341 + i * sizeof(u32); 342 + 343 + rc = hl_snprintf_resize(buf, size, offset, "sync id: %u", i); 344 + if (rc) 345 + goto free_sync_objects; 346 + sync_name = 
hl_state_dump_get_sync_name(hdev, i); 347 + if (sync_name) { 348 + rc = hl_snprintf_resize(buf, size, offset, " %s", 349 + sync_name); 350 + if (rc) 351 + goto free_sync_objects; 352 + } 353 + rc = hl_snprintf_resize(buf, size, offset, ", value: %u", 354 + sync_objects[i]); 355 + if (rc) 356 + goto free_sync_objects; 357 + 358 + /* Append engine string */ 359 + entry = hl_state_dump_get_sync_to_engine(map, 360 + (u32)sync_object_addr); 361 + if (entry) { 362 + rc = hl_snprintf_resize(buf, size, offset, 363 + ", Engine: "); 364 + if (rc) 365 + goto free_sync_objects; 366 + rc = hl_print_resize_sync_engine(buf, size, offset, 367 + entry->engine_type, 368 + entry->engine_id); 369 + if (rc) 370 + goto free_sync_objects; 371 + } 372 + 373 + rc = hl_snprintf_resize(buf, size, offset, "\n"); 374 + if (rc) 375 + goto free_sync_objects; 376 + } 377 + 378 + free_sync_objects: 379 + hl_state_dump_free_sync_objects(sync_objects); 380 + out: 381 + return rc; 382 + } 383 + 384 + /** 385 + * hl_state_dump_print_syncs - print active sync objects 386 + * @hdev: pointer to the device 387 + * @buf: destination buffer double pointer to be used with hl_snprintf_resize 388 + * @size: pointer to the size container 389 + * @offset: pointer to the offset container 390 + * 391 + * Returns 0 on success or error code on failure 392 + */ 393 + static int hl_state_dump_print_syncs(struct hl_device *hdev, 394 + char **buf, size_t *size, 395 + size_t *offset) 396 + 397 + { 398 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 399 + struct hl_sync_to_engine_map *map; 400 + u32 index; 401 + int rc = 0; 402 + 403 + map = kzalloc(sizeof(*map), GFP_KERNEL); 404 + if (!map) 405 + return -ENOMEM; 406 + 407 + rc = sds->funcs.gen_sync_to_engine_map(hdev, map); 408 + if (rc) 409 + goto free_map_mem; 410 + 411 + rc = hl_snprintf_resize(buf, size, offset, "Non zero sync objects:\n"); 412 + if (rc) 413 + goto out; 414 + 415 + if (sds->sync_namager_names) { 416 + for (index = 0; 
sds->sync_namager_names[index]; ++index) { 417 + rc = hl_state_dump_print_syncs_single_block( 418 + hdev, index, buf, size, offset, map); 419 + if (rc) 420 + goto out; 421 + } 422 + } else { 423 + for (index = 0; index < sds->props[SP_NUM_CORES]; ++index) { 424 + rc = hl_state_dump_print_syncs_single_block( 425 + hdev, index, buf, size, offset, map); 426 + if (rc) 427 + goto out; 428 + } 429 + } 430 + 431 + out: 432 + hl_state_dump_free_sync_to_engine_map(map); 433 + free_map_mem: 434 + kfree(map); 435 + 436 + return rc; 437 + } 438 + 439 + /** 440 + * hl_state_dump_alloc_read_sm_block_monitors - read monitors for a specific 441 + * block 442 + * @hdev: pointer to the device 443 + * @index: sync manager block index starting with E_N 444 + * 445 + * Returns an array of monitor data of size SP_MONITORS_AMOUNT or NULL 446 + * on error 447 + */ 448 + static struct hl_mon_state_dump * 449 + hl_state_dump_alloc_read_sm_block_monitors(struct hl_device *hdev, u32 index) 450 + { 451 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 452 + struct hl_mon_state_dump *monitors; 453 + s64 base_addr; /* Base addr can be negative */ 454 + int i; 455 + 456 + monitors = vmalloc(sds->props[SP_MONITORS_AMOUNT] * 457 + sizeof(struct hl_mon_state_dump)); 458 + if (!monitors) 459 + return NULL; 460 + 461 + base_addr = sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index; 462 + 463 + for (i = 0; i < sds->props[SP_MONITORS_AMOUNT]; ++i) { 464 + monitors[i].id = i; 465 + monitors[i].wr_addr_low = 466 + RREG32(base_addr + sds->props[SP_MON_OBJ_WR_ADDR_LOW] + 467 + i * sizeof(u32)); 468 + 469 + monitors[i].wr_addr_high = 470 + RREG32(base_addr + sds->props[SP_MON_OBJ_WR_ADDR_HIGH] + 471 + i * sizeof(u32)); 472 + 473 + monitors[i].wr_data = 474 + RREG32(base_addr + sds->props[SP_MON_OBJ_WR_DATA] + 475 + i * sizeof(u32)); 476 + 477 + monitors[i].arm_data = 478 + RREG32(base_addr + sds->props[SP_MON_OBJ_ARM_DATA] + 479 + i * sizeof(u32)); 480 + 481 + monitors[i].status = 482 + RREG32(base_addr + 
sds->props[SP_MON_OBJ_STATUS] + 483 + i * sizeof(u32)); 484 + } 485 + 486 + return monitors; 487 + } 488 + 489 + /** 490 + * hl_state_dump_free_monitors - free the monitors structure 491 + * @monitors: monitors array created with 492 + * hl_state_dump_alloc_read_sm_block_monitors 493 + */ 494 + static void hl_state_dump_free_monitors(struct hl_mon_state_dump *monitors) 495 + { 496 + vfree(monitors); 497 + } 498 + 499 + /** 500 + * hl_state_dump_print_monitors_single_block - print active monitors on a 501 + * single block 502 + * @hdev: pointer to the device 503 + * @index: sync manager block index starting with E_N 504 + * @buf: destination buffer double pointer to be used with hl_snprintf_resize 505 + * @size: pointer to the size container 506 + * @offset: pointer to the offset container 507 + * 508 + * Returns 0 on success or error code on failure 509 + */ 510 + static int hl_state_dump_print_monitors_single_block(struct hl_device *hdev, 511 + u32 index, 512 + char **buf, size_t *size, 513 + size_t *offset) 514 + { 515 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 516 + struct hl_mon_state_dump *monitors = NULL; 517 + int rc = 0, i; 518 + 519 + if (sds->sync_namager_names) { 520 + rc = hl_snprintf_resize( 521 + buf, size, offset, "%s\n", 522 + sds->sync_namager_names[index]); 523 + if (rc) 524 + goto out; 525 + } 526 + 527 + monitors = hl_state_dump_alloc_read_sm_block_monitors(hdev, index); 528 + if (!monitors) { 529 + rc = -ENOMEM; 530 + goto out; 531 + } 532 + 533 + for (i = 0; i < sds->props[SP_MONITORS_AMOUNT]; ++i) { 534 + if (!(sds->funcs.monitor_valid(&monitors[i]))) 535 + continue; 536 + 537 + /* Monitor is valid, dump it */ 538 + rc = sds->funcs.print_single_monitor(buf, size, offset, hdev, 539 + &monitors[i]); 540 + if (rc) 541 + goto free_monitors; 542 + 543 + hl_snprintf_resize(buf, size, offset, "\n"); 544 + } 545 + 546 + free_monitors: 547 + hl_state_dump_free_monitors(monitors); 548 + out: 549 + return rc; 550 + } 551 + 552 + /** 
553 + * hl_state_dump_print_monitors - print active monitors 554 + * @hdev: pointer to the device 555 + * @buf: destination buffer double pointer to be used with hl_snprintf_resize 556 + * @size: pointer to the size container 557 + * @offset: pointer to the offset container 558 + * 559 + * Returns 0 on success or error code on failure 560 + */ 561 + static int hl_state_dump_print_monitors(struct hl_device *hdev, 562 + char **buf, size_t *size, 563 + size_t *offset) 564 + { 565 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 566 + u32 index; 567 + int rc = 0; 568 + 569 + rc = hl_snprintf_resize(buf, size, offset, 570 + "Valid (armed) monitor objects:\n"); 571 + if (rc) 572 + goto out; 573 + 574 + if (sds->sync_namager_names) { 575 + for (index = 0; sds->sync_namager_names[index]; ++index) { 576 + rc = hl_state_dump_print_monitors_single_block( 577 + hdev, index, buf, size, offset); 578 + if (rc) 579 + goto out; 580 + } 581 + } else { 582 + for (index = 0; index < sds->props[SP_NUM_CORES]; ++index) { 583 + rc = hl_state_dump_print_monitors_single_block( 584 + hdev, index, buf, size, offset); 585 + if (rc) 586 + goto out; 587 + } 588 + } 589 + 590 + out: 591 + return rc; 592 + } 593 + 594 + /** 595 + * hl_state_dump_print_engine_fences - print active fences for a specific 596 + * engine 597 + * @hdev: pointer to the device 598 + * @engine_type: engine type to use 599 + * @buf: destination buffer double pointer to be used with hl_snprintf_resize 600 + * @size: pointer to the size container 601 + * @offset: pointer to the offset container 602 + */ 603 + static int 604 + hl_state_dump_print_engine_fences(struct hl_device *hdev, 605 + enum hl_sync_engine_type engine_type, 606 + char **buf, size_t *size, size_t *offset) 607 + { 608 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 609 + int rc = 0, i, n_fences; 610 + u64 base_addr, next_fence; 611 + 612 + switch (engine_type) { 613 + case ENGINE_TPC: 614 + n_fences = 
sds->props[SP_NUM_OF_TPC_ENGINES]; 615 + base_addr = sds->props[SP_TPC0_CMDQ]; 616 + next_fence = sds->props[SP_NEXT_TPC]; 617 + break; 618 + case ENGINE_MME: 619 + n_fences = sds->props[SP_NUM_OF_MME_ENGINES]; 620 + base_addr = sds->props[SP_MME_CMDQ]; 621 + next_fence = sds->props[SP_NEXT_MME]; 622 + break; 623 + case ENGINE_DMA: 624 + n_fences = sds->props[SP_NUM_OF_DMA_ENGINES]; 625 + base_addr = sds->props[SP_DMA_CMDQ]; 626 + next_fence = sds->props[SP_DMA_QUEUES_OFFSET]; 627 + break; 628 + default: 629 + return -EINVAL; 630 + } 631 + for (i = 0; i < n_fences; ++i) { 632 + rc = sds->funcs.print_fences_single_engine( 633 + hdev, 634 + base_addr + next_fence * i + 635 + sds->props[SP_FENCE0_CNT_OFFSET], 636 + base_addr + next_fence * i + 637 + sds->props[SP_CP_STS_OFFSET], 638 + engine_type, i, buf, size, offset); 639 + if (rc) 640 + goto out; 641 + } 642 + out: 643 + return rc; 644 + } 645 + 646 + /** 647 + * hl_state_dump_print_fences - print active fences 648 + * @hdev: pointer to the device 649 + * @buf: destination buffer double pointer to be used with hl_snprintf_resize 650 + * @size: pointer to the size container 651 + * @offset: pointer to the offset container 652 + */ 653 + static int hl_state_dump_print_fences(struct hl_device *hdev, char **buf, 654 + size_t *size, size_t *offset) 655 + { 656 + int rc = 0; 657 + 658 + rc = hl_snprintf_resize(buf, size, offset, "Valid (armed) fences:\n"); 659 + if (rc) 660 + goto out; 661 + 662 + rc = hl_state_dump_print_engine_fences(hdev, ENGINE_TPC, buf, size, offset); 663 + if (rc) 664 + goto out; 665 + 666 + rc = hl_state_dump_print_engine_fences(hdev, ENGINE_MME, buf, size, offset); 667 + if (rc) 668 + goto out; 669 + 670 + rc = hl_state_dump_print_engine_fences(hdev, ENGINE_DMA, buf, size, offset); 671 + if (rc) 672 + goto out; 673 + 674 + out: 675 + return rc; 676 + } 677 + 678 + /** 679 + * hl_state_dump() - dump system state 680 + * @hdev: pointer to device structure 681 + */ 682 + int hl_state_dump(struct 
hl_device *hdev) 683 + { 684 + char *buf = NULL; 685 + size_t offset = 0, size = 0; 686 + int rc; 687 + 688 + rc = hl_snprintf_resize(&buf, &size, &offset, 689 + "Timestamp taken on: %llu\n\n", 690 + ktime_to_ns(ktime_get())); 691 + if (rc) 692 + goto err; 693 + 694 + rc = hl_state_dump_print_syncs(hdev, &buf, &size, &offset); 695 + if (rc) 696 + goto err; 697 + 698 + hl_snprintf_resize(&buf, &size, &offset, "\n"); 699 + 700 + rc = hl_state_dump_print_monitors(hdev, &buf, &size, &offset); 701 + if (rc) 702 + goto err; 703 + 704 + hl_snprintf_resize(&buf, &size, &offset, "\n"); 705 + 706 + rc = hl_state_dump_print_fences(hdev, &buf, &size, &offset); 707 + if (rc) 708 + goto err; 709 + 710 + hl_snprintf_resize(&buf, &size, &offset, "\n"); 711 + 712 + hl_debugfs_set_state_dump(hdev, buf, size); 713 + 714 + return 0; 715 + err: 716 + vfree(buf); 717 + return rc; 718 + }

+7 -13

drivers/misc/habanalabs/common/sysfs.c

··· 9 9 10 10 #include <linux/pci.h> 11 11 12 - long hl_get_frequency(struct hl_device *hdev, u32 pll_index, 13 - bool curr) 12 + long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr) 14 13 { 15 14 struct cpucp_packet pkt; 16 15 u32 used_pll_idx; ··· 43 44 return (long) result; 44 45 } 45 46 46 - void hl_set_frequency(struct hl_device *hdev, u32 pll_index, 47 - u64 freq) 47 + void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq) 48 48 { 49 49 struct cpucp_packet pkt; 50 50 u32 used_pll_idx; ··· 283 285 char *buf) 284 286 { 285 287 struct hl_device *hdev = dev_get_drvdata(dev); 286 - char *str; 288 + char str[HL_STR_MAX]; 287 289 288 - if (atomic_read(&hdev->in_reset)) 289 - str = "In reset"; 290 - else if (hdev->disabled) 291 - str = "Malfunction"; 292 - else if (hdev->needs_reset) 293 - str = "Needs Reset"; 294 - else 295 - str = "Operational"; 290 + strscpy(str, hdev->status[hl_device_status(hdev)], HL_STR_MAX); 291 + 292 + /* use uppercase for backward compatibility */ 293 + str[0] = 'A' + (str[0] - 'a'); 296 294 297 295 return sprintf(buf, "%s\n", str); 298 296 }

+571 -145

drivers/misc/habanalabs/gaudi/gaudi.c

··· 76 76 #define GAUDI_PLDM_MMU_TIMEOUT_USEC (MMU_CONFIG_TIMEOUT_USEC * 100) 77 77 #define GAUDI_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30) 78 78 #define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC (HL_DEVICE_TIMEOUT_USEC * 30) 79 - #define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC 1000000 /* 1s */ 79 + #define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC 4000000 /* 4s */ 80 80 #define GAUDI_MSG_TO_CPU_TIMEOUT_USEC 4000000 /* 4s */ 81 81 #define GAUDI_WAIT_FOR_BL_TIMEOUT_USEC 15000000 /* 15s */ 82 82 ··· 105 105 #define HBM_SCRUBBING_TIMEOUT_US 1000000 /* 1s */ 106 106 107 107 #define GAUDI_PLL_MAX 10 108 + 109 + #define BIN_REG_STRING_SIZE sizeof("0b10101010101010101010101010101010") 110 + 111 + #define MONITOR_SOB_STRING_SIZE 256 112 + 113 + static u32 gaudi_stream_master[GAUDI_STREAM_MASTER_ARR_SIZE] = { 114 + GAUDI_QUEUE_ID_DMA_0_0, 115 + GAUDI_QUEUE_ID_DMA_0_1, 116 + GAUDI_QUEUE_ID_DMA_0_2, 117 + GAUDI_QUEUE_ID_DMA_0_3, 118 + GAUDI_QUEUE_ID_DMA_1_0, 119 + GAUDI_QUEUE_ID_DMA_1_1, 120 + GAUDI_QUEUE_ID_DMA_1_2, 121 + GAUDI_QUEUE_ID_DMA_1_3 122 + }; 108 123 109 124 static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = { 110 125 "gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3", ··· 363 348 QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_3 */ 364 349 }; 365 350 351 + static struct hl_hw_obj_name_entry gaudi_so_id_to_str[] = { 352 + { .id = 0, .name = "SYNC_OBJ_DMA_DOWN_FEEDBACK" }, 353 + { .id = 1, .name = "SYNC_OBJ_DMA_UP_FEEDBACK" }, 354 + { .id = 2, .name = "SYNC_OBJ_DMA_STATIC_DRAM_SRAM_FEEDBACK" }, 355 + { .id = 3, .name = "SYNC_OBJ_DMA_SRAM_DRAM_FEEDBACK" }, 356 + { .id = 4, .name = "SYNC_OBJ_FIRST_COMPUTE_FINISH" }, 357 + { .id = 5, .name = "SYNC_OBJ_HOST_DRAM_DONE" }, 358 + { .id = 6, .name = "SYNC_OBJ_DBG_CTR_DEPRECATED" }, 359 + { .id = 7, .name = "SYNC_OBJ_DMA_ACTIVATIONS_DRAM_SRAM_FEEDBACK" }, 360 + { .id = 8, .name = "SYNC_OBJ_ENGINE_SEM_MME_0" }, 361 + { .id = 9, .name = "SYNC_OBJ_ENGINE_SEM_MME_1" }, 362 + { .id = 10, .name = 
"SYNC_OBJ_ENGINE_SEM_TPC_0" }, 363 + { .id = 11, .name = "SYNC_OBJ_ENGINE_SEM_TPC_1" }, 364 + { .id = 12, .name = "SYNC_OBJ_ENGINE_SEM_TPC_2" }, 365 + { .id = 13, .name = "SYNC_OBJ_ENGINE_SEM_TPC_3" }, 366 + { .id = 14, .name = "SYNC_OBJ_ENGINE_SEM_TPC_4" }, 367 + { .id = 15, .name = "SYNC_OBJ_ENGINE_SEM_TPC_5" }, 368 + { .id = 16, .name = "SYNC_OBJ_ENGINE_SEM_TPC_6" }, 369 + { .id = 17, .name = "SYNC_OBJ_ENGINE_SEM_TPC_7" }, 370 + { .id = 18, .name = "SYNC_OBJ_ENGINE_SEM_DMA_1" }, 371 + { .id = 19, .name = "SYNC_OBJ_ENGINE_SEM_DMA_2" }, 372 + { .id = 20, .name = "SYNC_OBJ_ENGINE_SEM_DMA_3" }, 373 + { .id = 21, .name = "SYNC_OBJ_ENGINE_SEM_DMA_4" }, 374 + { .id = 22, .name = "SYNC_OBJ_ENGINE_SEM_DMA_5" }, 375 + { .id = 23, .name = "SYNC_OBJ_ENGINE_SEM_DMA_6" }, 376 + { .id = 24, .name = "SYNC_OBJ_ENGINE_SEM_DMA_7" }, 377 + { .id = 25, .name = "SYNC_OBJ_DBG_CTR_0" }, 378 + { .id = 26, .name = "SYNC_OBJ_DBG_CTR_1" }, 379 + }; 380 + 381 + static struct hl_hw_obj_name_entry gaudi_monitor_id_to_str[] = { 382 + { .id = 200, .name = "MON_OBJ_DMA_DOWN_FEEDBACK_RESET" }, 383 + { .id = 201, .name = "MON_OBJ_DMA_UP_FEADBACK_RESET" }, 384 + { .id = 203, .name = "MON_OBJ_DRAM_TO_SRAM_QUEUE_FENCE" }, 385 + { .id = 204, .name = "MON_OBJ_TPC_0_CLK_GATE" }, 386 + { .id = 205, .name = "MON_OBJ_TPC_1_CLK_GATE" }, 387 + { .id = 206, .name = "MON_OBJ_TPC_2_CLK_GATE" }, 388 + { .id = 207, .name = "MON_OBJ_TPC_3_CLK_GATE" }, 389 + { .id = 208, .name = "MON_OBJ_TPC_4_CLK_GATE" }, 390 + { .id = 209, .name = "MON_OBJ_TPC_5_CLK_GATE" }, 391 + { .id = 210, .name = "MON_OBJ_TPC_6_CLK_GATE" }, 392 + { .id = 211, .name = "MON_OBJ_TPC_7_CLK_GATE" }, 393 + }; 394 + 395 + static s64 gaudi_state_dump_specs_props[] = { 396 + [SP_SYNC_OBJ_BASE_ADDR] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0, 397 + [SP_NEXT_SYNC_OBJ_ADDR] = NEXT_SYNC_OBJ_ADDR_INTERVAL, 398 + [SP_SYNC_OBJ_AMOUNT] = NUM_OF_SOB_IN_BLOCK, 399 + [SP_MON_OBJ_WR_ADDR_LOW] = 400 + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0, 401 + 
[SP_MON_OBJ_WR_ADDR_HIGH] = 402 + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRH_0, 403 + [SP_MON_OBJ_WR_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_DATA_0, 404 + [SP_MON_OBJ_ARM_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_ARM_0, 405 + [SP_MON_OBJ_STATUS] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0, 406 + [SP_MONITORS_AMOUNT] = NUM_OF_MONITORS_IN_BLOCK, 407 + [SP_TPC0_CMDQ] = mmTPC0_QM_GLBL_CFG0, 408 + [SP_TPC0_CFG_SO] = mmTPC0_CFG_QM_SYNC_OBJECT_ADDR, 409 + [SP_NEXT_TPC] = mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0, 410 + [SP_MME_CMDQ] = mmMME0_QM_GLBL_CFG0, 411 + [SP_MME_CFG_SO] = mmMME0_CTRL_ARCH_DESC_SYNC_OBJECT_ADDR_LOW_LOCAL, 412 + [SP_NEXT_MME] = mmMME2_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0, 413 + [SP_DMA_CMDQ] = mmDMA0_QM_GLBL_CFG0, 414 + [SP_DMA_CFG_SO] = mmDMA0_CORE_WR_COMP_ADDR_LO, 415 + [SP_DMA_QUEUES_OFFSET] = mmDMA1_QM_GLBL_CFG0 - mmDMA0_QM_GLBL_CFG0, 416 + [SP_NUM_OF_MME_ENGINES] = NUM_OF_MME_ENGINES, 417 + [SP_SUB_MME_ENG_NUM] = NUM_OF_MME_SUB_ENGINES, 418 + [SP_NUM_OF_DMA_ENGINES] = NUM_OF_DMA_ENGINES, 419 + [SP_NUM_OF_TPC_ENGINES] = NUM_OF_TPC_ENGINES, 420 + [SP_ENGINE_NUM_OF_QUEUES] = NUM_OF_QUEUES, 421 + [SP_ENGINE_NUM_OF_STREAMS] = NUM_OF_STREAMS, 422 + [SP_ENGINE_NUM_OF_FENCES] = NUM_OF_FENCES, 423 + [SP_FENCE0_CNT_OFFSET] = 424 + mmDMA0_QM_CP_FENCE0_CNT_0 - mmDMA0_QM_GLBL_CFG0, 425 + [SP_FENCE0_RDATA_OFFSET] = 426 + mmDMA0_QM_CP_FENCE0_RDATA_0 - mmDMA0_QM_GLBL_CFG0, 427 + [SP_CP_STS_OFFSET] = mmDMA0_QM_CP_STS_0 - mmDMA0_QM_GLBL_CFG0, 428 + [SP_NUM_CORES] = 1, 429 + }; 430 + 431 + /* The order here is opposite to the order of the indexing in the h/w. 432 + * i.e. SYNC_MGR_W_S is actually 0, SYNC_MGR_E_S is 1, etc. 
433 + */ 434 + static const char * const gaudi_sync_manager_names[] = { 435 + "SYNC_MGR_E_N", 436 + "SYNC_MGR_W_N", 437 + "SYNC_MGR_E_S", 438 + "SYNC_MGR_W_S", 439 + NULL 440 + }; 441 + 366 442 struct ecc_info_extract_params { 367 443 u64 block_address; 368 444 u32 num_memories; ··· 469 363 u32 size, u64 val); 470 364 static int gaudi_memset_registers(struct hl_device *hdev, u64 reg_base, 471 365 u32 num_regs, u32 val); 472 - static int gaudi_schedule_register_memset(struct hl_device *hdev, 473 - u32 hw_queue_id, u64 reg_base, u32 num_regs, u32 val); 474 366 static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel, 475 367 u32 tpc_id); 476 368 static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev); ··· 479 375 u32 size, bool eb); 480 376 static u32 gaudi_gen_wait_cb(struct hl_device *hdev, 481 377 struct hl_gen_wait_properties *prop); 482 - 483 378 static inline enum hl_collective_mode 484 379 get_collective_mode(struct hl_device *hdev, u32 queue_id) 485 380 { ··· 506 403 507 404 if (hdev->card_type == cpucp_card_type_pmc) { 508 405 prop->max_power_default = MAX_POWER_DEFAULT_PMC; 509 - prop->dc_power_default = DC_POWER_DEFAULT_PMC; 406 + 407 + if (prop->fw_security_enabled) 408 + prop->dc_power_default = DC_POWER_DEFAULT_PMC_SEC; 409 + else 410 + prop->dc_power_default = DC_POWER_DEFAULT_PMC; 510 411 } else { 511 412 prop->max_power_default = MAX_POWER_DEFAULT_PCI; 512 413 prop->dc_power_default = DC_POWER_DEFAULT_PCI; ··· 557 450 get_collective_mode(hdev, i); 558 451 } 559 452 453 + prop->device_dma_offset_for_host_access = HOST_PHYS_BASE; 560 454 prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; 561 455 prop->collective_first_sob = 0; 562 456 prop->collective_first_mon = 0; ··· 658 550 prop->fw_cpu_boot_dev_sts1_valid = false; 659 551 prop->hard_reset_done_by_fw = false; 660 552 prop->gic_interrupts_enable = true; 553 + 554 + prop->server_type = HL_SERVER_TYPE_UNKNOWN; 661 555 662 556 return 0; 663 557 } ··· 833 723 
GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC); 834 724 if (rc) { 835 725 if (hdev->reset_on_preboot_fail) 836 - hdev->asic_funcs->hw_fini(hdev, true); 726 + hdev->asic_funcs->hw_fini(hdev, true, false); 837 727 goto pci_fini; 838 728 } 839 729 840 730 if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { 841 731 dev_info(hdev->dev, 842 732 "H/W state is dirty, must reset before initializing\n"); 843 - hdev->asic_funcs->hw_fini(hdev, true); 733 + hdev->asic_funcs->hw_fini(hdev, true, false); 844 734 } 845 735 846 736 return 0; ··· 1084 974 struct gaudi_hw_sob_group *hw_sob_group = 1085 975 container_of(ref, struct gaudi_hw_sob_group, kref); 1086 976 struct hl_device *hdev = hw_sob_group->hdev; 1087 - u64 base_addr; 1088 - int rc; 977 + int i; 1089 978 1090 - base_addr = CFG_BASE + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + 1091 - hw_sob_group->base_sob_id * 4; 1092 - rc = gaudi_schedule_register_memset(hdev, hw_sob_group->queue_id, 1093 - base_addr, NUMBER_OF_SOBS_IN_GRP, 0); 1094 - if (rc) 1095 - dev_err(hdev->dev, 1096 - "failed resetting sob group - sob base %u, count %u", 1097 - hw_sob_group->base_sob_id, NUMBER_OF_SOBS_IN_GRP); 979 + for (i = 0 ; i < NUMBER_OF_SOBS_IN_GRP ; i++) 980 + WREG32((mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + 981 + (hw_sob_group->base_sob_id * 4) + (i * 4)), 0); 1098 982 1099 983 kref_init(&hw_sob_group->kref); 1100 984 } ··· 1225 1121 queue_id = job->hw_queue_id; 1226 1122 prop = &hdev->kernel_queues[queue_id].sync_stream_prop; 1227 1123 1124 + if (job->cs->encaps_signals) { 1125 + /* use the encaps signal handle store earlier in the flow 1126 + * and set the SOB information from the encaps 1127 + * signals handle 1128 + */ 1129 + hl_hw_queue_encaps_sig_set_sob_info(hdev, job->cs, job, 1130 + cs_cmpl); 1131 + 1132 + dev_dbg(hdev->dev, "collective wait: Sequence %llu found, sob_id: %u, wait for sob_val: %u\n", 1133 + job->cs->sequence, 1134 + cs_cmpl->hw_sob->sob_id, 1135 + cs_cmpl->sob_val); 1136 + } 1137 + 1228 1138 /* Add to wait CBs using 
slave monitor */ 1229 1139 wait_prop.data = (void *) job->user_cb; 1230 1140 wait_prop.sob_base = cs_cmpl->hw_sob->sob_id; ··· 1249 1131 wait_prop.size = cb_size; 1250 1132 1251 1133 dev_dbg(hdev->dev, 1252 - "Generate slave wait CB, sob %d, val:0x%x, mon %d, q %d\n", 1134 + "Generate slave wait CB, sob %d, val:%x, mon %d, q %d\n", 1253 1135 cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, 1254 1136 prop->collective_slave_mon_id, queue_id); 1255 1137 ··· 1263 1145 prop->collective_sob_id, cb_size, false); 1264 1146 } 1265 1147 1266 - static void gaudi_collective_wait_init_cs(struct hl_cs *cs) 1148 + static int gaudi_collective_wait_init_cs(struct hl_cs *cs) 1267 1149 { 1268 1150 struct hl_cs_compl *signal_cs_cmpl = 1269 1151 container_of(cs->signal_fence, struct hl_cs_compl, base_fence); ··· 1281 1163 gaudi = hdev->asic_specific; 1282 1164 cprop = &gaudi->collective_props; 1283 1165 1284 - /* copy the SOB id and value of the signal CS */ 1285 - cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob; 1286 - cs_cmpl->sob_val = signal_cs_cmpl->sob_val; 1166 + /* In encaps signals case the SOB info will be retrieved from 1167 + * the handle in gaudi_collective_slave_init_job. 1168 + */ 1169 + if (!cs->encaps_signals) { 1170 + /* copy the SOB id and value of the signal CS */ 1171 + cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob; 1172 + cs_cmpl->sob_val = signal_cs_cmpl->sob_val; 1173 + } 1174 + 1175 + /* check again if the signal cs already completed. 1176 + * if yes then don't send any wait cs since the hw_sob 1177 + * could be in reset already. if signal is not completed 1178 + * then get refcount to hw_sob to prevent resetting the sob 1179 + * while wait cs is not submitted. 1180 + * note that this check is protected by two locks, 1181 + * hw queue lock and completion object lock, 1182 + * and the same completion object lock also protects 1183 + * the hw_sob reset handler function. 1184 + * The hw_queue lock prevent out of sync of hw_sob 1185 + * refcount value, changed by signal/wait flows. 
1186 + */ 1187 + spin_lock(&signal_cs_cmpl->lock); 1188 + 1189 + if (completion_done(&cs->signal_fence->completion)) { 1190 + spin_unlock(&signal_cs_cmpl->lock); 1191 + return -EINVAL; 1192 + } 1193 + /* Increment kref since all slave queues are now waiting on it */ 1194 + kref_get(&cs_cmpl->hw_sob->kref); 1195 + 1196 + spin_unlock(&signal_cs_cmpl->lock); 1287 1197 1288 1198 /* Calculate the stream from collective master queue (1st job) */ 1289 1199 job = list_first_entry(&cs->job_list, struct hl_cs_job, cs_node); ··· 1356 1210 cprop->curr_sob_group_idx[stream], stream); 1357 1211 } 1358 1212 1359 - /* Increment kref since all slave queues are now waiting on it */ 1360 - kref_get(&cs_cmpl->hw_sob->kref); 1361 - /* 1362 - * Must put the signal fence after the SOB refcnt increment so 1363 - * the SOB refcnt won't turn 0 and reset the SOB before the 1364 - * wait CS was submitted. 1365 - */ 1366 1213 mb(); 1367 1214 hl_fence_put(cs->signal_fence); 1368 1215 cs->signal_fence = NULL; 1216 + 1217 + return 0; 1369 1218 } 1370 1219 1371 1220 static int gaudi_collective_wait_create_job(struct hl_device *hdev, 1372 1221 struct hl_ctx *ctx, struct hl_cs *cs, 1373 - enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id) 1222 + enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id, 1223 + u32 encaps_signal_offset) 1374 1224 { 1375 1225 struct hw_queue_properties *hw_queue_prop; 1376 1226 struct hl_cs_counters_atomic *cntr; ··· 1426 1284 job->user_cb_size = cb_size; 1427 1285 job->hw_queue_id = queue_id; 1428 1286 1287 + /* since its guaranteed to have only one chunk in the collective wait 1288 + * cs, we can use this chunk to set the encapsulated signal offset 1289 + * in the jobs. 1290 + */ 1291 + if (cs->encaps_signals) 1292 + job->encaps_sig_wait_offset = encaps_signal_offset; 1293 + 1429 1294 /* 1430 1295 * No need in parsing, user CB is the patched CB. 
1431 1296 * We call hl_cb_destroy() out of two reasons - we don't need ··· 1461 1312 } 1462 1313 1463 1314 static int gaudi_collective_wait_create_jobs(struct hl_device *hdev, 1464 - struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id, 1465 - u32 collective_engine_id) 1315 + struct hl_ctx *ctx, struct hl_cs *cs, 1316 + u32 wait_queue_id, u32 collective_engine_id, 1317 + u32 encaps_signal_offset) 1466 1318 { 1467 1319 struct gaudi_device *gaudi = hdev->asic_specific; 1468 1320 struct hw_queue_properties *hw_queue_prop; ··· 1513 1363 if (i == 0) { 1514 1364 queue_id = wait_queue_id; 1515 1365 rc = gaudi_collective_wait_create_job(hdev, ctx, cs, 1516 - HL_COLLECTIVE_MASTER, queue_id, wait_queue_id); 1366 + HL_COLLECTIVE_MASTER, queue_id, 1367 + wait_queue_id, encaps_signal_offset); 1517 1368 } else { 1518 1369 if (nic_idx < NIC_NUMBER_OF_ENGINES) { 1519 1370 if (gaudi->hw_cap_initialized & ··· 1534 1383 } 1535 1384 1536 1385 rc = gaudi_collective_wait_create_job(hdev, ctx, cs, 1537 - HL_COLLECTIVE_SLAVE, queue_id, wait_queue_id); 1386 + HL_COLLECTIVE_SLAVE, queue_id, 1387 + wait_queue_id, encaps_signal_offset); 1538 1388 } 1539 1389 1540 1390 if (rc) ··· 1583 1431 return rc; 1584 1432 } 1585 1433 1434 + /* Scrub both SRAM and DRAM */ 1435 + rc = hdev->asic_funcs->scrub_device_mem(hdev, 0, 0); 1436 + if (rc) 1437 + goto disable_pci_access; 1438 + 1586 1439 rc = gaudi_fetch_psoc_frequency(hdev); 1587 1440 if (rc) { 1588 1441 dev_err(hdev->dev, "Failed to fetch psoc frequency\n"); ··· 1611 1454 dev_err(hdev->dev, "Failed to init collective\n"); 1612 1455 goto disable_pci_access; 1613 1456 } 1457 + 1458 + /* We only support a single ASID for the user, so for the sake of optimization, just 1459 + * initialize the ASID one time during device initialization with the fixed value of 1 1460 + */ 1461 + gaudi_mmu_prepare(hdev, 1); 1614 1462 1615 1463 return 0; 1616 1464 ··· 1882 1720 hdev->supports_sync_stream = true; 1883 1721 hdev->supports_coresight = true; 1884 1722 
hdev->supports_staged_submission = true; 1723 + hdev->supports_wait_for_multi_cs = true; 1885 1724 1886 - gaudi_set_pci_memory_regions(hdev); 1725 + hdev->asic_funcs->set_pci_memory_regions(hdev); 1726 + hdev->stream_master_qid_arr = 1727 + hdev->asic_funcs->get_stream_master_qid_arr(); 1728 + hdev->stream_master_qid_arr_size = GAUDI_STREAM_MASTER_ARR_SIZE; 1887 1729 1888 1730 return 0; 1889 1731 ··· 2689 2523 tpc_id < TPC_NUMBER_OF_ENGINES; 2690 2524 tpc_id++, tpc_offset += TPC_CFG_OFFSET) { 2691 2525 /* Mask all arithmetic interrupts from TPC */ 2692 - WREG32(mmTPC0_CFG_TPC_INTR_MASK + tpc_offset, 0x8FFF); 2526 + WREG32(mmTPC0_CFG_TPC_INTR_MASK + tpc_offset, 0x8FFE); 2693 2527 /* Set 16 cache lines */ 2694 2528 WREG32_FIELD(TPC0_CFG_MSS_CONFIG, tpc_offset, 2695 2529 ICACHE_FETCH_LINE_NUM, 2); ··· 3836 3670 WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0); 3837 3671 } 3838 3672 3839 - static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset) 3673 + static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset) 3840 3674 { 3841 3675 u32 wait_timeout_ms; 3842 3676 ··· 3847 3681 wait_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; 3848 3682 else 3849 3683 wait_timeout_ms = GAUDI_RESET_WAIT_MSEC; 3684 + 3685 + if (fw_reset) 3686 + goto skip_engines; 3850 3687 3851 3688 gaudi_stop_nic_qmans(hdev); 3852 3689 gaudi_stop_mme_qmans(hdev); ··· 3876 3707 3877 3708 gaudi_disable_timestamp(hdev); 3878 3709 3710 + skip_engines: 3879 3711 gaudi_disable_msi(hdev); 3880 3712 } 3881 3713 ··· 3908 3738 /* init MMU cache manage page */ 3909 3739 WREG32(mmSTLB_CACHE_INV_BASE_39_8, MMU_CACHE_MNG_ADDR >> 8); 3910 3740 WREG32(mmSTLB_CACHE_INV_BASE_49_40, MMU_CACHE_MNG_ADDR >> 40); 3741 + 3742 + /* mem cache invalidation */ 3743 + WREG32(mmSTLB_MEM_CACHE_INVALIDATION, 1); 3911 3744 3912 3745 hdev->asic_funcs->mmu_invalidate_cache(hdev, true, 0); 3913 3746 ··· 4244 4071 return rc; 4245 4072 } 4246 4073 4247 - static void gaudi_hw_fini(struct hl_device *hdev, bool 
hard_reset) 4074 + static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset) 4248 4075 { 4249 4076 struct cpu_dyn_regs *dyn_regs = 4250 4077 &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; ··· 4263 4090 } else { 4264 4091 reset_timeout_ms = GAUDI_RESET_TIMEOUT_MSEC; 4265 4092 cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC; 4093 + } 4094 + 4095 + if (fw_reset) { 4096 + dev_info(hdev->dev, 4097 + "Firmware performs HARD reset, going to wait %dms\n", 4098 + reset_timeout_ms); 4099 + 4100 + goto skip_reset; 4266 4101 } 4267 4102 4268 4103 driver_performs_reset = !!(!hdev->asic_prop.fw_security_enabled && ··· 4349 4168 reset_timeout_ms); 4350 4169 } 4351 4170 4171 + skip_reset: 4352 4172 /* 4353 4173 * After hard reset, we can't poll the BTM_FSM register because the PSOC 4354 4174 * itself is in reset. Need to wait until the reset is deasserted ··· 4394 4212 return gaudi_init_iatu(hdev); 4395 4213 } 4396 4214 4397 - static int gaudi_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma, 4215 + static int gaudi_mmap(struct hl_device *hdev, struct vm_area_struct *vma, 4398 4216 void *cpu_addr, dma_addr_t dma_addr, size_t size) 4399 4217 { 4400 4218 int rc; ··· 4803 4621 "Doing HBM scrubbing for 0x%09llx - 0x%09llx\n", 4804 4622 cur_addr, cur_addr + chunk_size); 4805 4623 4806 - WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, 0); 4807 - WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, 0); 4624 + WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, 0xdeadbeaf); 4625 + WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, 0xdeadbeaf); 4808 4626 WREG32(mmDMA0_CORE_DST_BASE_LO + dma_offset, 4809 4627 lower_32_bits(cur_addr)); 4810 4628 WREG32(mmDMA0_CORE_DST_BASE_HI + dma_offset, ··· 5978 5796 return rc; 5979 5797 } 5980 5798 5981 - static int gaudi_schedule_register_memset(struct hl_device *hdev, 5982 - u32 hw_queue_id, u64 reg_base, u32 num_regs, u32 val) 5983 - { 5984 - struct hl_ctx *ctx; 5985 - struct hl_pending_cb *pending_cb; 5986 - struct packet_msg_long *pkt; 
5987 - u32 cb_size, ctl; 5988 - struct hl_cb *cb; 5989 - int i, rc; 5990 - 5991 - mutex_lock(&hdev->fpriv_list_lock); 5992 - ctx = hdev->compute_ctx; 5993 - 5994 - /* If no compute context available or context is going down 5995 - * memset registers directly 5996 - */ 5997 - if (!ctx || kref_read(&ctx->refcount) == 0) { 5998 - rc = gaudi_memset_registers(hdev, reg_base, num_regs, val); 5999 - mutex_unlock(&hdev->fpriv_list_lock); 6000 - return rc; 6001 - } 6002 - 6003 - mutex_unlock(&hdev->fpriv_list_lock); 6004 - 6005 - cb_size = (sizeof(*pkt) * num_regs) + 6006 - sizeof(struct packet_msg_prot) * 2; 6007 - 6008 - if (cb_size > SZ_2M) { 6009 - dev_err(hdev->dev, "CB size must be smaller than %uMB", SZ_2M); 6010 - return -ENOMEM; 6011 - } 6012 - 6013 - pending_cb = kzalloc(sizeof(*pending_cb), GFP_KERNEL); 6014 - if (!pending_cb) 6015 - return -ENOMEM; 6016 - 6017 - cb = hl_cb_kernel_create(hdev, cb_size, false); 6018 - if (!cb) { 6019 - kfree(pending_cb); 6020 - return -EFAULT; 6021 - } 6022 - 6023 - pkt = cb->kernel_address; 6024 - 6025 - ctl = FIELD_PREP(GAUDI_PKT_LONG_CTL_OP_MASK, 0); /* write the value */ 6026 - ctl |= FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_LONG); 6027 - ctl |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1); 6028 - ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1); 6029 - ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); 6030 - 6031 - for (i = 0; i < num_regs ; i++, pkt++) { 6032 - pkt->ctl = cpu_to_le32(ctl); 6033 - pkt->value = cpu_to_le32(val); 6034 - pkt->addr = cpu_to_le64(reg_base + (i * 4)); 6035 - } 6036 - 6037 - hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT); 6038 - 6039 - pending_cb->cb = cb; 6040 - pending_cb->cb_size = cb_size; 6041 - /* The queue ID MUST be an external queue ID. 
Otherwise, we will 6042 - * have undefined behavior 6043 - */ 6044 - pending_cb->hw_queue_id = hw_queue_id; 6045 - 6046 - spin_lock(&ctx->pending_cb_lock); 6047 - list_add_tail(&pending_cb->cb_node, &ctx->pending_cb_list); 6048 - spin_unlock(&ctx->pending_cb_lock); 6049 - 6050 - return 0; 6051 - } 6052 - 6053 5799 static int gaudi_restore_sm_registers(struct hl_device *hdev) 6054 5800 { 6055 5801 u64 base_addr; ··· 6123 6013 6124 6014 static int gaudi_context_switch(struct hl_device *hdev, u32 asid) 6125 6015 { 6126 - return gaudi_restore_user_registers(hdev); 6016 + return 0; 6127 6017 } 6128 6018 6129 6019 static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev) ··· 6833 6723 asid); 6834 6724 } 6835 6725 6726 + gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER, asid); 6727 + gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER, asid); 6728 + 6836 6729 hdev->asic_funcs->set_clock_gating(hdev); 6837 6730 6838 6731 mutex_unlock(&gaudi->clk_gate_mutex); ··· 6885 6772 6886 6773 dma_offset = gaudi_dma_assignment[GAUDI_PCI_DMA_1] * DMA_CORE_OFFSET; 6887 6774 6888 - WREG32_OR(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_VAL_SHIFT)); 6775 + WREG32(mmDMA0_CORE_PROT + dma_offset, 6776 + BIT(DMA0_CORE_PROT_ERR_VAL_SHIFT) | BIT(DMA0_CORE_PROT_VAL_SHIFT)); 6889 6777 6890 6778 rc = hl_hw_queue_send_cb_no_cmpl(hdev, GAUDI_QUEUE_ID_DMA_0_0, 6891 6779 job->job_cb_size, cb->bus_address); ··· 6907 6793 } 6908 6794 6909 6795 free_fence_ptr: 6910 - WREG32_AND(mmDMA0_CORE_PROT + dma_offset, 6911 - ~BIT(DMA0_CORE_PROT_VAL_SHIFT)); 6796 + WREG32(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_ERR_VAL_SHIFT)); 6912 6797 6913 6798 hdev->asic_funcs->asic_dma_pool_free(hdev, (void *) fence_ptr, 6914 6799 fence_dma_addr); ··· 7281 7168 7282 7169 cq_ptr = (((u64) RREG32(cq_ptr_hi)) << 32) | RREG32(cq_ptr_lo); 7283 7170 size = RREG32(cq_tsize); 7284 - dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %x\n", 7171 + dev_info(hdev->dev, "stop on err: 
stream: %u, addr: %#llx, size: %u\n", 7285 7172 stream, cq_ptr, size); 7286 7173 } 7287 7174 ··· 7337 7224 7338 7225 addr = le64_to_cpu(bd->ptr); 7339 7226 7340 - dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n", 7227 + dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %u\n", 7341 7228 stream, ci, addr, len); 7342 7229 7343 7230 /* get previous ci, wrap if needed */ ··· 7439 7326 { 7440 7327 u32 index = event_type - GAUDI_EVENT_DMA_IF_SEI_0; 7441 7328 7329 + /* Flip the bits as the enum is ordered in the opposite way */ 7330 + index = (index ^ 0x3) & 0x3; 7331 + 7442 7332 switch (sei_data->sei_cause) { 7443 7333 case SM_SEI_SO_OVERFLOW: 7444 - dev_err(hdev->dev, 7445 - "SM %u SEI Error: SO %u overflow/underflow", 7446 - index, le32_to_cpu(sei_data->sei_log)); 7334 + dev_err_ratelimited(hdev->dev, 7335 + "%s SEI Error: SOB Group %u overflow/underflow", 7336 + gaudi_sync_manager_names[index], 7337 + le32_to_cpu(sei_data->sei_log)); 7447 7338 break; 7448 7339 case SM_SEI_LBW_4B_UNALIGNED: 7449 - dev_err(hdev->dev, 7450 - "SM %u SEI Error: Unaligned 4B LBW access, monitor agent address low - %#x", 7451 - index, le32_to_cpu(sei_data->sei_log)); 7340 + dev_err_ratelimited(hdev->dev, 7341 + "%s SEI Error: Unaligned 4B LBW access, monitor agent address low - %#x", 7342 + gaudi_sync_manager_names[index], 7343 + le32_to_cpu(sei_data->sei_log)); 7452 7344 break; 7453 7345 case SM_SEI_AXI_RESPONSE_ERR: 7454 - dev_err(hdev->dev, 7455 - "SM %u SEI Error: AXI ID %u response error", 7456 - index, le32_to_cpu(sei_data->sei_log)); 7346 + dev_err_ratelimited(hdev->dev, 7347 + "%s SEI Error: AXI ID %u response error", 7348 + gaudi_sync_manager_names[index], 7349 + le32_to_cpu(sei_data->sei_log)); 7457 7350 break; 7458 7351 default: 7459 - dev_err(hdev->dev, "Unknown SM SEI cause %u", 7352 + dev_err_ratelimited(hdev->dev, "Unknown SM SEI cause %u", 7460 7353 le32_to_cpu(sei_data->sei_log)); 7461 7354 break; 7462 7355 } ··· 7476 
7357 u8 index, memory_wrapper_idx = 0; 7477 7358 bool extract_info_from_fw; 7478 7359 int rc; 7360 + 7361 + if (hdev->asic_prop.fw_security_enabled) { 7362 + extract_info_from_fw = true; 7363 + goto extract_ecc_info; 7364 + } 7479 7365 7480 7366 switch (event_type) { 7481 7367 case GAUDI_EVENT_PCIE_CORE_SERR ... GAUDI_EVENT_PCIE_PHY_DERR: ··· 7554 7430 return; 7555 7431 } 7556 7432 7433 + extract_ecc_info: 7557 7434 if (extract_info_from_fw) { 7558 7435 ecc_address = le64_to_cpu(ecc_data->ecc_address); 7559 7436 ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom); ··· 7931 7806 u32 ctl = le32_to_cpu(eq_entry->hdr.ctl); 7932 7807 u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK) 7933 7808 >> EQ_CTL_EVENT_TYPE_SHIFT); 7934 - u8 cause; 7935 7809 bool reset_required; 7810 + u8 cause; 7811 + int rc; 7812 + 7813 + if (event_type >= GAUDI_EVENT_SIZE) { 7814 + dev_err(hdev->dev, "Event type %u exceeds maximum of %u", 7815 + event_type, GAUDI_EVENT_SIZE - 1); 7816 + return; 7817 + } 7936 7818 7937 7819 gaudi->events_stat[event_type]++; 7938 7820 gaudi->events_stat_aggregate[event_type]++; ··· 8012 7880 tpc_dec_event_to_tpc_id(event_type), 8013 7881 "AXI_SLV_DEC_Error"); 8014 7882 if (reset_required) { 8015 - dev_err(hdev->dev, "hard reset required due to %s\n", 7883 + dev_err(hdev->dev, "reset required due to %s\n", 8016 7884 gaudi_irq_map_table[event_type].name); 8017 7885 8018 - goto reset_device; 7886 + hl_device_reset(hdev, 0); 8019 7887 } else { 8020 7888 hl_fw_unmask_irq(hdev, event_type); 8021 7889 } ··· 8034 7902 tpc_krn_event_to_tpc_id(event_type), 8035 7903 "KRN_ERR"); 8036 7904 if (reset_required) { 8037 - dev_err(hdev->dev, "hard reset required due to %s\n", 7905 + dev_err(hdev->dev, "reset required due to %s\n", 8038 7906 gaudi_irq_map_table[event_type].name); 8039 7907 8040 - goto reset_device; 7908 + hl_device_reset(hdev, 0); 8041 7909 } else { 8042 7910 hl_fw_unmask_irq(hdev, event_type); 8043 7911 } ··· 8125 7993 gaudi_print_irq_info(hdev, event_type, false); 
8126 7994 gaudi_print_sm_sei_info(hdev, event_type, 8127 7995 &eq_entry->sm_sei_data); 7996 + rc = hl_state_dump(hdev); 7997 + if (rc) 7998 + dev_err(hdev->dev, 7999 + "Error during system state dump %d\n", rc); 8128 8000 hl_fw_unmask_irq(hdev, event_type); 8129 8001 break; 8130 8002 ··· 8167 8031 return; 8168 8032 8169 8033 reset_device: 8170 - if (hdev->hard_reset_on_fw_events) 8034 + if (hdev->asic_prop.fw_security_enabled) 8035 + hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW); 8036 + else if (hdev->hard_reset_on_fw_events) 8171 8037 hl_device_reset(hdev, HL_RESET_HARD); 8172 8038 else 8173 8039 hl_fw_unmask_irq(hdev, event_type); ··· 8701 8563 8702 8564 static int gaudi_ctx_init(struct hl_ctx *ctx) 8703 8565 { 8566 + int rc; 8567 + 8704 8568 if (ctx->asid == HL_KERNEL_ASID_ID) 8705 8569 return 0; 8706 8570 8707 - gaudi_mmu_prepare(ctx->hdev, ctx->asid); 8708 - return gaudi_internal_cb_pool_init(ctx->hdev, ctx); 8571 + rc = gaudi_internal_cb_pool_init(ctx->hdev, ctx); 8572 + if (rc) 8573 + return rc; 8574 + 8575 + rc = gaudi_restore_user_registers(ctx->hdev); 8576 + if (rc) 8577 + gaudi_internal_cb_pool_fini(ctx->hdev, ctx); 8578 + 8579 + return rc; 8709 8580 } 8710 8581 8711 8582 static void gaudi_ctx_fini(struct hl_ctx *ctx) ··· 8741 8594 return sizeof(struct packet_msg_short) * 4 + 8742 8595 sizeof(struct packet_fence) + 8743 8596 sizeof(struct packet_msg_prot) * 2; 8597 + } 8598 + 8599 + static u32 gaudi_get_sob_addr(struct hl_device *hdev, u32 sob_id) 8600 + { 8601 + return mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + (sob_id * 4); 8744 8602 } 8745 8603 8746 8604 static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id, ··· 9054 8902 static void gaudi_reset_sob(struct hl_device *hdev, void *data) 9055 8903 { 9056 8904 struct hl_hw_sob *hw_sob = (struct hl_hw_sob *) data; 9057 - int rc; 9058 8905 9059 8906 dev_dbg(hdev->dev, "reset SOB, q_idx: %d, sob_id: %d\n", hw_sob->q_idx, 9060 8907 hw_sob->sob_id); 9061 8908 9062 - rc = 
gaudi_schedule_register_memset(hdev, hw_sob->q_idx, 9063 - CFG_BASE + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + 9064 - hw_sob->sob_id * 4, 1, 0); 9065 - if (rc) 9066 - dev_err(hdev->dev, "failed resetting sob %u", hw_sob->sob_id); 8909 + WREG32(mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + 8910 + hw_sob->sob_id * 4, 0); 9067 8911 9068 8912 kref_init(&hw_sob->kref); 9069 8913 } ··· 9125 8977 } 9126 8978 } 9127 8979 8980 + static int gaudi_add_sync_to_engine_map_entry( 8981 + struct hl_sync_to_engine_map *map, u32 reg_value, 8982 + enum hl_sync_engine_type engine_type, u32 engine_id) 8983 + { 8984 + struct hl_sync_to_engine_map_entry *entry; 8985 + 8986 + /* Reg value represents a partial address of sync object, 8987 + * it is used as unique identifier. For this we need to 8988 + * clear the cutoff cfg base bits from the value. 8989 + */ 8990 + if (reg_value == 0 || reg_value == 0xffffffff) 8991 + return 0; 8992 + reg_value -= (u32)CFG_BASE; 8993 + 8994 + /* create a new hash entry */ 8995 + entry = kzalloc(sizeof(*entry), GFP_KERNEL); 8996 + if (!entry) 8997 + return -ENOMEM; 8998 + entry->engine_type = engine_type; 8999 + entry->engine_id = engine_id; 9000 + entry->sync_id = reg_value; 9001 + hash_add(map->tb, &entry->node, reg_value); 9002 + 9003 + return 0; 9004 + } 9005 + 9006 + static int gaudi_gen_sync_to_engine_map(struct hl_device *hdev, 9007 + struct hl_sync_to_engine_map *map) 9008 + { 9009 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 9010 + struct gaudi_device *gaudi = hdev->asic_specific; 9011 + int i, j, rc; 9012 + u32 reg_value; 9013 + 9014 + /* Iterate over TPC engines */ 9015 + for (i = 0; i < sds->props[SP_NUM_OF_TPC_ENGINES]; ++i) { 9016 + /* TPC registered must be accessed with clock gating disabled */ 9017 + mutex_lock(&gaudi->clk_gate_mutex); 9018 + hdev->asic_funcs->disable_clock_gating(hdev); 9019 + 9020 + reg_value = RREG32(sds->props[SP_TPC0_CFG_SO] + 9021 + sds->props[SP_NEXT_TPC] * i); 9022 + 9023 + /* We can reenable 
clock_gating */ 9024 + hdev->asic_funcs->set_clock_gating(hdev); 9025 + mutex_unlock(&gaudi->clk_gate_mutex); 9026 + 9027 + rc = gaudi_add_sync_to_engine_map_entry(map, reg_value, 9028 + ENGINE_TPC, i); 9029 + if (rc) 9030 + goto free_sync_to_engine_map; 9031 + } 9032 + 9033 + /* Iterate over MME engines */ 9034 + for (i = 0; i < sds->props[SP_NUM_OF_MME_ENGINES]; ++i) { 9035 + for (j = 0; j < sds->props[SP_SUB_MME_ENG_NUM]; ++j) { 9036 + /* MME registered must be accessed with clock gating 9037 + * disabled 9038 + */ 9039 + mutex_lock(&gaudi->clk_gate_mutex); 9040 + hdev->asic_funcs->disable_clock_gating(hdev); 9041 + 9042 + reg_value = RREG32(sds->props[SP_MME_CFG_SO] + 9043 + sds->props[SP_NEXT_MME] * i + 9044 + j * sizeof(u32)); 9045 + 9046 + /* We can reenable clock_gating */ 9047 + hdev->asic_funcs->set_clock_gating(hdev); 9048 + mutex_unlock(&gaudi->clk_gate_mutex); 9049 + 9050 + rc = gaudi_add_sync_to_engine_map_entry( 9051 + map, reg_value, ENGINE_MME, 9052 + i * sds->props[SP_SUB_MME_ENG_NUM] + j); 9053 + if (rc) 9054 + goto free_sync_to_engine_map; 9055 + } 9056 + } 9057 + 9058 + /* Iterate over DMA engines */ 9059 + for (i = 0; i < sds->props[SP_NUM_OF_DMA_ENGINES]; ++i) { 9060 + reg_value = RREG32(sds->props[SP_DMA_CFG_SO] + 9061 + sds->props[SP_DMA_QUEUES_OFFSET] * i); 9062 + rc = gaudi_add_sync_to_engine_map_entry(map, reg_value, 9063 + ENGINE_DMA, i); 9064 + if (rc) 9065 + goto free_sync_to_engine_map; 9066 + } 9067 + 9068 + return 0; 9069 + 9070 + free_sync_to_engine_map: 9071 + hl_state_dump_free_sync_to_engine_map(map); 9072 + 9073 + return rc; 9074 + } 9075 + 9076 + static int gaudi_monitor_valid(struct hl_mon_state_dump *mon) 9077 + { 9078 + return FIELD_GET( 9079 + SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_MASK, 9080 + mon->status); 9081 + } 9082 + 9083 + static void gaudi_fill_sobs_from_mon(char *sobs, struct hl_mon_state_dump *mon) 9084 + { 9085 + const size_t max_write = 10; 9086 + u32 gid, mask, sob; 9087 + int i, offset; 9088 + 9089 
+ /* Sync object ID is calculated as follows: 9090 + * (8 * group_id + cleared bits in mask) 9091 + */ 9092 + gid = FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK, 9093 + mon->arm_data); 9094 + mask = FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK, 9095 + mon->arm_data); 9096 + 9097 + for (i = 0, offset = 0; mask && offset < MONITOR_SOB_STRING_SIZE - 9098 + max_write; mask >>= 1, i++) { 9099 + if (!(mask & 1)) { 9100 + sob = gid * MONITOR_MAX_SOBS + i; 9101 + 9102 + if (offset > 0) 9103 + offset += snprintf(sobs + offset, max_write, 9104 + ", "); 9105 + 9106 + offset += snprintf(sobs + offset, max_write, "%u", sob); 9107 + } 9108 + } 9109 + } 9110 + 9111 + static int gaudi_print_single_monitor(char **buf, size_t *size, size_t *offset, 9112 + struct hl_device *hdev, 9113 + struct hl_mon_state_dump *mon) 9114 + { 9115 + const char *name; 9116 + char scratch_buf1[BIN_REG_STRING_SIZE], 9117 + scratch_buf2[BIN_REG_STRING_SIZE]; 9118 + char monitored_sobs[MONITOR_SOB_STRING_SIZE] = {0}; 9119 + 9120 + name = hl_state_dump_get_monitor_name(hdev, mon); 9121 + if (!name) 9122 + name = ""; 9123 + 9124 + gaudi_fill_sobs_from_mon(monitored_sobs, mon); 9125 + 9126 + return hl_snprintf_resize( 9127 + buf, size, offset, 9128 + "Mon id: %u%s, wait for group id: %u mask %s to reach val: %u and write %u to address 0x%llx. Pending: %s. 
Means sync objects [%s] are being monitored.", 9129 + mon->id, name, 9130 + FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK, 9131 + mon->arm_data), 9132 + hl_format_as_binary( 9133 + scratch_buf1, sizeof(scratch_buf1), 9134 + FIELD_GET( 9135 + SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK, 9136 + mon->arm_data)), 9137 + FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_MASK, 9138 + mon->arm_data), 9139 + mon->wr_data, 9140 + (((u64)mon->wr_addr_high) << 32) | mon->wr_addr_low, 9141 + hl_format_as_binary( 9142 + scratch_buf2, sizeof(scratch_buf2), 9143 + FIELD_GET( 9144 + SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_MASK, 9145 + mon->status)), 9146 + monitored_sobs); 9147 + } 9148 + 9149 + 9150 + static int gaudi_print_fences_single_engine( 9151 + struct hl_device *hdev, u64 base_offset, u64 status_base_offset, 9152 + enum hl_sync_engine_type engine_type, u32 engine_id, char **buf, 9153 + size_t *size, size_t *offset) 9154 + { 9155 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 9156 + int rc = -ENOMEM, i; 9157 + u32 *statuses, *fences; 9158 + 9159 + statuses = kcalloc(sds->props[SP_ENGINE_NUM_OF_QUEUES], 9160 + sizeof(*statuses), GFP_KERNEL); 9161 + if (!statuses) 9162 + goto out; 9163 + 9164 + fences = kcalloc(sds->props[SP_ENGINE_NUM_OF_FENCES] * 9165 + sds->props[SP_ENGINE_NUM_OF_QUEUES], 9166 + sizeof(*fences), GFP_KERNEL); 9167 + if (!fences) 9168 + goto free_status; 9169 + 9170 + for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_FENCES]; ++i) 9171 + statuses[i] = RREG32(status_base_offset + i * sizeof(u32)); 9172 + 9173 + for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_FENCES] * 9174 + sds->props[SP_ENGINE_NUM_OF_QUEUES]; ++i) 9175 + fences[i] = RREG32(base_offset + i * sizeof(u32)); 9176 + 9177 + /* The actual print */ 9178 + for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_QUEUES]; ++i) { 9179 + u32 fence_id; 9180 + u64 fence_cnt, fence_rdata; 9181 + const char *engine_name; 9182 + 9183 + if 
(!FIELD_GET(TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_MASK, 9184 + statuses[i])) 9185 + continue; 9186 + 9187 + fence_id = 9188 + FIELD_GET(TPC0_QM_CP_STS_0_FENCE_ID_MASK, statuses[i]); 9189 + fence_cnt = base_offset + CFG_BASE + 9190 + sizeof(u32) * 9191 + (i + fence_id * sds->props[SP_ENGINE_NUM_OF_QUEUES]); 9192 + fence_rdata = fence_cnt - sds->props[SP_FENCE0_CNT_OFFSET] + 9193 + sds->props[SP_FENCE0_RDATA_OFFSET]; 9194 + engine_name = hl_sync_engine_to_string(engine_type); 9195 + 9196 + rc = hl_snprintf_resize( 9197 + buf, size, offset, 9198 + "%s%u, stream %u: fence id %u cnt = 0x%llx (%s%u_QM.CP_FENCE%u_CNT_%u) rdata = 0x%llx (%s%u_QM.CP_FENCE%u_RDATA_%u) value = %u, cp_status = %u\n", 9199 + engine_name, engine_id, 9200 + i, fence_id, 9201 + fence_cnt, engine_name, engine_id, fence_id, i, 9202 + fence_rdata, engine_name, engine_id, fence_id, i, 9203 + fences[fence_id], 9204 + statuses[i]); 9205 + if (rc) 9206 + goto free_fences; 9207 + } 9208 + 9209 + rc = 0; 9210 + 9211 + free_fences: 9212 + kfree(fences); 9213 + free_status: 9214 + kfree(statuses); 9215 + out: 9216 + return rc; 9217 + } 9218 + 9219 + 9220 + static struct hl_state_dump_specs_funcs gaudi_state_dump_funcs = { 9221 + .monitor_valid = gaudi_monitor_valid, 9222 + .print_single_monitor = gaudi_print_single_monitor, 9223 + .gen_sync_to_engine_map = gaudi_gen_sync_to_engine_map, 9224 + .print_fences_single_engine = gaudi_print_fences_single_engine, 9225 + }; 9226 + 9227 + static void gaudi_state_dump_init(struct hl_device *hdev) 9228 + { 9229 + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; 9230 + int i; 9231 + 9232 + for (i = 0; i < ARRAY_SIZE(gaudi_so_id_to_str); ++i) 9233 + hash_add(sds->so_id_to_str_tb, 9234 + &gaudi_so_id_to_str[i].node, 9235 + gaudi_so_id_to_str[i].id); 9236 + 9237 + for (i = 0; i < ARRAY_SIZE(gaudi_monitor_id_to_str); ++i) 9238 + hash_add(sds->monitor_id_to_str_tb, 9239 + &gaudi_monitor_id_to_str[i].node, 9240 + gaudi_monitor_id_to_str[i].id); 9241 + 9242 + sds->props 
= gaudi_state_dump_specs_props; 9243 + 9244 + sds->sync_namager_names = gaudi_sync_manager_names; 9245 + 9246 + sds->funcs = gaudi_state_dump_funcs; 9247 + } 9248 + 9249 + static u32 *gaudi_get_stream_master_qid_arr(void) 9250 + { 9251 + return gaudi_stream_master; 9252 + } 9253 + 9128 9254 static const struct hl_asic_funcs gaudi_funcs = { 9129 9255 .early_init = gaudi_early_init, 9130 9256 .early_fini = gaudi_early_fini, ··· 9411 8989 .halt_engines = gaudi_halt_engines, 9412 8990 .suspend = gaudi_suspend, 9413 8991 .resume = gaudi_resume, 9414 - .cb_mmap = gaudi_cb_mmap, 8992 + .mmap = gaudi_mmap, 9415 8993 .ring_doorbell = gaudi_ring_doorbell, 9416 8994 .pqe_write = gaudi_pqe_write, 9417 8995 .asic_dma_alloc_coherent = gaudi_dma_alloc_coherent, ··· 9484 9062 .enable_events_from_fw = gaudi_enable_events_from_fw, 9485 9063 .map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx, 9486 9064 .init_firmware_loader = gaudi_init_firmware_loader, 9487 - .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm 9065 + .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm, 9066 + .state_dump_init = gaudi_state_dump_init, 9067 + .get_sob_addr = gaudi_get_sob_addr, 9068 + .set_pci_memory_regions = gaudi_set_pci_memory_regions, 9069 + .get_stream_master_qid_arr = gaudi_get_stream_master_qid_arr 9488 9070 }; 9489 9071 9490 9072 /**

+18 -1

drivers/misc/habanalabs/gaudi/gaudiP.h

··· 36 36 #define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + \ 37 37 NUMBER_OF_CPU_HW_QUEUES) 38 38 39 + #define GAUDI_STREAM_MASTER_ARR_SIZE 8 40 + 39 41 #if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES) 40 42 #error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES" 41 43 #endif ··· 52 50 #define DC_POWER_DEFAULT_PCI 60000 /* 60W */ 53 51 #define DC_POWER_DEFAULT_PMC 60000 /* 60W */ 54 52 53 + #define DC_POWER_DEFAULT_PMC_SEC 97000 /* 97W */ 54 + 55 55 #define GAUDI_CPU_TIMEOUT_USEC 30000000 /* 30s */ 56 56 57 57 #define TPC_ENABLED_MASK 0xFF ··· 66 62 67 63 #define DMA_MAX_TRANSFER_SIZE U32_MAX 68 64 69 - #define GAUDI_DEFAULT_CARD_NAME "HL2000" 65 + #define GAUDI_DEFAULT_CARD_NAME "HL205" 70 66 71 67 #define GAUDI_MAX_PENDING_CS SZ_16K 72 68 ··· 121 117 (((mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_511 - \ 122 118 mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0) + 4) >> 2) 123 119 120 + #define MONITOR_MAX_SOBS 8 124 121 125 122 /* DRAM Memory Map */ 126 123 ··· 204 199 #define HW_CAP_TPC7 BIT(31) 205 200 #define HW_CAP_TPC_MASK GENMASK(31, 24) 206 201 #define HW_CAP_TPC_SHIFT 24 202 + 203 + #define NEXT_SYNC_OBJ_ADDR_INTERVAL \ 204 + (mmSYNC_MNGR_W_N_SYNC_MNGR_OBJS_SOB_OBJ_0 - \ 205 + mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0) 206 + #define NUM_OF_MME_ENGINES 2 207 + #define NUM_OF_MME_SUB_ENGINES 2 208 + #define NUM_OF_TPC_ENGINES 8 209 + #define NUM_OF_DMA_ENGINES 8 210 + #define NUM_OF_QUEUES 5 211 + #define NUM_OF_STREAMS 4 212 + #define NUM_OF_FENCES 4 213 + 207 214 208 215 #define GAUDI_CPU_PCI_MSB_ADDR(addr) (((addr) & GENMASK_ULL(49, 39)) >> 39) 209 216 #define GAUDI_PCI_TO_CPU_ADDR(addr) \

-5

drivers/misc/habanalabs/gaudi/gaudi_coresight.c

··· 622 622 return -EINVAL; 623 623 } 624 624 625 - gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER, 626 - hdev->compute_ctx->asid); 627 - gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER, 628 - hdev->compute_ctx->asid); 629 - 630 625 msb = upper_32_bits(input->buffer_address) >> 8; 631 626 msb &= PSOC_GLOBAL_CONF_TRACE_ADDR_MSB_MASK; 632 627 WREG32(mmPSOC_GLOBAL_CONF_TRACE_ADDR, msb);

+8

drivers/misc/habanalabs/gaudi/gaudi_security.c

··· 9559 9559 mask |= 1U << ((mmTPC0_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2); 9560 9560 mask |= 1U << ((mmTPC0_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2); 9561 9561 mask |= 1U << ((mmTPC0_CFG_TPC_STALL & 0x7F) >> 2); 9562 + mask |= 1U << ((mmTPC0_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2); 9562 9563 mask |= 1U << ((mmTPC0_CFG_RD_RATE_LIMIT & 0x7F) >> 2); 9563 9564 mask |= 1U << ((mmTPC0_CFG_WR_RATE_LIMIT & 0x7F) >> 2); 9564 9565 mask |= 1U << ((mmTPC0_CFG_MSS_CONFIG & 0x7F) >> 2); ··· 10014 10013 mask |= 1U << ((mmTPC1_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2); 10015 10014 mask |= 1U << ((mmTPC1_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2); 10016 10015 mask |= 1U << ((mmTPC1_CFG_TPC_STALL & 0x7F) >> 2); 10016 + mask |= 1U << ((mmTPC1_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2); 10017 10017 mask |= 1U << ((mmTPC1_CFG_RD_RATE_LIMIT & 0x7F) >> 2); 10018 10018 mask |= 1U << ((mmTPC1_CFG_WR_RATE_LIMIT & 0x7F) >> 2); 10019 10019 mask |= 1U << ((mmTPC1_CFG_MSS_CONFIG & 0x7F) >> 2); ··· 10468 10466 mask |= 1U << ((mmTPC2_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2); 10469 10467 mask |= 1U << ((mmTPC2_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2); 10470 10468 mask |= 1U << ((mmTPC2_CFG_TPC_STALL & 0x7F) >> 2); 10469 + mask |= 1U << ((mmTPC2_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2); 10471 10470 mask |= 1U << ((mmTPC2_CFG_RD_RATE_LIMIT & 0x7F) >> 2); 10472 10471 mask |= 1U << ((mmTPC2_CFG_WR_RATE_LIMIT & 0x7F) >> 2); 10473 10472 mask |= 1U << ((mmTPC2_CFG_MSS_CONFIG & 0x7F) >> 2); ··· 10922 10919 mask |= 1U << ((mmTPC3_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2); 10923 10920 mask |= 1U << ((mmTPC3_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2); 10924 10921 mask |= 1U << ((mmTPC3_CFG_TPC_STALL & 0x7F) >> 2); 10922 + mask |= 1U << ((mmTPC3_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2); 10925 10923 mask |= 1U << ((mmTPC3_CFG_RD_RATE_LIMIT & 0x7F) >> 2); 10926 10924 mask |= 1U << ((mmTPC3_CFG_WR_RATE_LIMIT & 0x7F) >> 2); 10927 10925 mask |= 1U << ((mmTPC3_CFG_MSS_CONFIG & 0x7F) >> 2); ··· 11376 11372 mask |= 1U << 
((mmTPC4_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2); 11377 11373 mask |= 1U << ((mmTPC4_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2); 11378 11374 mask |= 1U << ((mmTPC4_CFG_TPC_STALL & 0x7F) >> 2); 11375 + mask |= 1U << ((mmTPC4_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2); 11379 11376 mask |= 1U << ((mmTPC4_CFG_RD_RATE_LIMIT & 0x7F) >> 2); 11380 11377 mask |= 1U << ((mmTPC4_CFG_WR_RATE_LIMIT & 0x7F) >> 2); 11381 11378 mask |= 1U << ((mmTPC4_CFG_MSS_CONFIG & 0x7F) >> 2); ··· 11830 11825 mask |= 1U << ((mmTPC5_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2); 11831 11826 mask |= 1U << ((mmTPC5_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2); 11832 11827 mask |= 1U << ((mmTPC5_CFG_TPC_STALL & 0x7F) >> 2); 11828 + mask |= 1U << ((mmTPC5_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2); 11833 11829 mask |= 1U << ((mmTPC5_CFG_RD_RATE_LIMIT & 0x7F) >> 2); 11834 11830 mask |= 1U << ((mmTPC5_CFG_WR_RATE_LIMIT & 0x7F) >> 2); 11835 11831 mask |= 1U << ((mmTPC5_CFG_MSS_CONFIG & 0x7F) >> 2); ··· 12286 12280 mask |= 1U << ((mmTPC6_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2); 12287 12281 mask |= 1U << ((mmTPC6_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2); 12288 12282 mask |= 1U << ((mmTPC6_CFG_TPC_STALL & 0x7F) >> 2); 12283 + mask |= 1U << ((mmTPC6_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2); 12289 12284 mask |= 1U << ((mmTPC6_CFG_RD_RATE_LIMIT & 0x7F) >> 2); 12290 12285 mask |= 1U << ((mmTPC6_CFG_WR_RATE_LIMIT & 0x7F) >> 2); 12291 12286 mask |= 1U << ((mmTPC6_CFG_MSS_CONFIG & 0x7F) >> 2); ··· 12742 12735 mask |= 1U << ((mmTPC7_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2); 12743 12736 mask |= 1U << ((mmTPC7_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2); 12744 12737 mask |= 1U << ((mmTPC7_CFG_TPC_STALL & 0x7F) >> 2); 12738 + mask |= 1U << ((mmTPC7_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2); 12745 12739 mask |= 1U << ((mmTPC7_CFG_RD_RATE_LIMIT & 0x7F) >> 2); 12746 12740 mask |= 1U << ((mmTPC7_CFG_WR_RATE_LIMIT & 0x7F) >> 2); 12747 12741 mask |= 1U << ((mmTPC7_CFG_MSS_CONFIG & 0x7F) >> 2);

+84 -18

drivers/misc/habanalabs/goya/goya.c

··· 350 350 GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E 351 351 }; 352 352 353 + static s64 goya_state_dump_specs_props[SP_MAX] = {0}; 354 + 353 355 static int goya_mmu_clear_pgt_range(struct hl_device *hdev); 354 356 static int goya_mmu_set_dram_default_page(struct hl_device *hdev); 355 357 static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev); ··· 389 387 prop->hw_queues_props[i].cb_alloc_flags = CB_ALLOC_USER; 390 388 } 391 389 390 + prop->device_dma_offset_for_host_access = HOST_PHYS_BASE; 392 391 prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES; 393 392 394 393 prop->dram_base_address = DRAM_PHYS_BASE; ··· 468 465 prop->fw_cpu_boot_dev_sts1_valid = false; 469 466 prop->hard_reset_done_by_fw = false; 470 467 prop->gic_interrupts_enable = true; 468 + 469 + prop->server_type = HL_SERVER_TYPE_UNKNOWN; 471 470 472 471 return 0; 473 472 } ··· 654 649 GOYA_BOOT_FIT_REQ_TIMEOUT_USEC); 655 650 if (rc) { 656 651 if (hdev->reset_on_preboot_fail) 657 - hdev->asic_funcs->hw_fini(hdev, true); 652 + hdev->asic_funcs->hw_fini(hdev, true, false); 658 653 goto pci_fini; 659 654 } 660 655 661 656 if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { 662 657 dev_info(hdev->dev, 663 658 "H/W state is dirty, must reset before initializing\n"); 664 - hdev->asic_funcs->hw_fini(hdev, true); 659 + hdev->asic_funcs->hw_fini(hdev, true, false); 665 660 } 666 661 667 662 if (!hdev->pldm) { ··· 960 955 hdev->supports_coresight = true; 961 956 hdev->supports_soft_reset = true; 962 957 hdev->allow_external_soft_reset = true; 958 + hdev->supports_wait_for_multi_cs = false; 963 959 964 - goya_set_pci_memory_regions(hdev); 960 + hdev->asic_funcs->set_pci_memory_regions(hdev); 965 961 966 962 return 0; 967 963 ··· 2380 2374 WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0); 2381 2375 } 2382 2376 2383 - static void goya_halt_engines(struct hl_device *hdev, bool hard_reset) 2377 + static void goya_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset) 2384 2378 { 
2385 2379 u32 wait_timeout_ms; 2386 2380 ··· 2499 2493 struct fw_load_mgr *fw_loader = &hdev->fw_loader; 2500 2494 2501 2495 /* fill common fields */ 2496 + fw_loader->linux_loaded = false; 2502 2497 fw_loader->boot_fit_img.image_name = GOYA_BOOT_FIT_FILE; 2503 2498 fw_loader->linux_img.image_name = GOYA_LINUX_FW_FILE; 2504 2499 fw_loader->cpu_timeout = GOYA_CPU_TIMEOUT_USEC; ··· 2703 2696 return rc; 2704 2697 } 2705 2698 2706 - /* 2707 - * goya_hw_fini - Goya hardware tear-down code 2708 - * 2709 - * @hdev: pointer to hl_device structure 2710 - * @hard_reset: should we do hard reset to all engines or just reset the 2711 - * compute/dma engines 2712 - */ 2713 - static void goya_hw_fini(struct hl_device *hdev, bool hard_reset) 2699 + static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset) 2714 2700 { 2715 2701 struct goya_device *goya = hdev->asic_specific; 2716 2702 u32 reset_timeout_ms, cpu_timeout_ms, status; ··· 2796 2796 return goya_init_iatu(hdev); 2797 2797 } 2798 2798 2799 - static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma, 2799 + static int goya_mmap(struct hl_device *hdev, struct vm_area_struct *vma, 2800 2800 void *cpu_addr, dma_addr_t dma_addr, size_t size) 2801 2801 { 2802 2802 int rc; ··· 4797 4797 >> EQ_CTL_EVENT_TYPE_SHIFT); 4798 4798 struct goya_device *goya = hdev->asic_specific; 4799 4799 4800 + if (event_type >= GOYA_ASYNC_EVENT_ID_SIZE) { 4801 + dev_err(hdev->dev, "Event type %u exceeds maximum of %u", 4802 + event_type, GOYA_ASYNC_EVENT_ID_SIZE - 1); 4803 + return; 4804 + } 4805 + 4800 4806 goya->events_stat[event_type]++; 4801 4807 goya->events_stat_aggregate[event_type]++; 4802 4808 ··· 5481 5475 return device_time | RREG32(mmPSOC_TIMESTAMP_CNTCVL); 5482 5476 } 5483 5477 5484 - static void goya_collective_wait_init_cs(struct hl_cs *cs) 5478 + static int goya_collective_wait_init_cs(struct hl_cs *cs) 5485 5479 { 5486 - 5480 + return 0; 5487 5481 } 5488 5482 5489 5483 static int 
goya_collective_wait_create_jobs(struct hl_device *hdev, 5490 5484 struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id, 5491 - u32 collective_engine_id) 5485 + u32 collective_engine_id, u32 encaps_signal_offset) 5492 5486 { 5493 5487 return -EINVAL; 5494 5488 } ··· 5530 5524 } 5531 5525 } 5532 5526 5527 + static int goya_gen_sync_to_engine_map(struct hl_device *hdev, 5528 + struct hl_sync_to_engine_map *map) 5529 + { 5530 + /* Not implemented */ 5531 + return 0; 5532 + } 5533 + 5534 + static int goya_monitor_valid(struct hl_mon_state_dump *mon) 5535 + { 5536 + /* Not implemented */ 5537 + return 0; 5538 + } 5539 + 5540 + static int goya_print_single_monitor(char **buf, size_t *size, size_t *offset, 5541 + struct hl_device *hdev, 5542 + struct hl_mon_state_dump *mon) 5543 + { 5544 + /* Not implemented */ 5545 + return 0; 5546 + } 5547 + 5548 + 5549 + static int goya_print_fences_single_engine( 5550 + struct hl_device *hdev, u64 base_offset, u64 status_base_offset, 5551 + enum hl_sync_engine_type engine_type, u32 engine_id, char **buf, 5552 + size_t *size, size_t *offset) 5553 + { 5554 + /* Not implemented */ 5555 + return 0; 5556 + } 5557 + 5558 + 5559 + static struct hl_state_dump_specs_funcs goya_state_dump_funcs = { 5560 + .monitor_valid = goya_monitor_valid, 5561 + .print_single_monitor = goya_print_single_monitor, 5562 + .gen_sync_to_engine_map = goya_gen_sync_to_engine_map, 5563 + .print_fences_single_engine = goya_print_fences_single_engine, 5564 + }; 5565 + 5566 + static void goya_state_dump_init(struct hl_device *hdev) 5567 + { 5568 + /* Not implemented */ 5569 + hdev->state_dump_specs.props = goya_state_dump_specs_props; 5570 + hdev->state_dump_specs.funcs = goya_state_dump_funcs; 5571 + } 5572 + 5573 + static u32 goya_get_sob_addr(struct hl_device *hdev, u32 sob_id) 5574 + { 5575 + return 0; 5576 + } 5577 + 5578 + static u32 *goya_get_stream_master_qid_arr(void) 5579 + { 5580 + return NULL; 5581 + } 5582 + 5533 5583 static const struct hl_asic_funcs 
goya_funcs = { 5534 5584 .early_init = goya_early_init, 5535 5585 .early_fini = goya_early_fini, ··· 5598 5536 .halt_engines = goya_halt_engines, 5599 5537 .suspend = goya_suspend, 5600 5538 .resume = goya_resume, 5601 - .cb_mmap = goya_cb_mmap, 5539 + .mmap = goya_mmap, 5602 5540 .ring_doorbell = goya_ring_doorbell, 5603 5541 .pqe_write = goya_pqe_write, 5604 5542 .asic_dma_alloc_coherent = goya_dma_alloc_coherent, ··· 5671 5609 .enable_events_from_fw = goya_enable_events_from_fw, 5672 5610 .map_pll_idx_to_fw_idx = goya_map_pll_idx_to_fw_idx, 5673 5611 .init_firmware_loader = goya_init_firmware_loader, 5674 - .init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram 5612 + .init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram, 5613 + .state_dump_init = goya_state_dump_init, 5614 + .get_sob_addr = &goya_get_sob_addr, 5615 + .set_pci_memory_regions = goya_set_pci_memory_regions, 5616 + .get_stream_master_qid_arr = goya_get_stream_master_qid_arr, 5675 5617 }; 5676 5618 5677 5619 /*

+109 -6

drivers/misc/habanalabs/include/common/cpucp_if.h

··· 98 98 __u8 pad[7]; 99 99 }; 100 100 101 + enum hl_pcie_addr_dec_cause { 102 + PCIE_ADDR_DEC_HBW_ERR_RESP, 103 + PCIE_ADDR_DEC_LBW_ERR_RESP, 104 + PCIE_ADDR_DEC_TLP_BLOCKED_BY_RR 105 + }; 106 + 107 + struct hl_eq_pcie_addr_dec_data { 108 + /* enum hl_pcie_addr_dec_cause */ 109 + __u8 addr_dec_cause; 110 + __u8 pad[7]; 111 + }; 112 + 101 113 struct hl_eq_entry { 102 114 struct hl_eq_header hdr; 103 115 union { ··· 118 106 struct hl_eq_sm_sei_data sm_sei_data; 119 107 struct cpucp_pkt_sync_err pkt_sync_err; 120 108 struct hl_eq_fw_alive fw_alive; 109 + struct hl_eq_pcie_addr_dec_data pcie_addr_dec_data; 121 110 __le64 data[7]; 122 111 }; 123 112 }; ··· 129 116 #define EQ_CTL_READY_MASK 0x80000000 130 117 131 118 #define EQ_CTL_EVENT_TYPE_SHIFT 16 132 - #define EQ_CTL_EVENT_TYPE_MASK 0x03FF0000 119 + #define EQ_CTL_EVENT_TYPE_MASK 0x0FFF0000 133 120 134 121 #define EQ_CTL_INDEX_SHIFT 0 135 122 #define EQ_CTL_INDEX_MASK 0x0000FFFF ··· 313 300 * The packet's arguments specify the desired sensor and the field to 314 301 * set. 315 302 * 316 - * CPUCP_PACKET_PCIE_THROUGHPUT_GET 303 + * CPUCP_PACKET_PCIE_THROUGHPUT_GET - 317 304 * Get throughput of PCIe. 318 305 * The packet's arguments specify the transaction direction (TX/RX). 319 306 * The window measurement is 10[msec], and the return value is in KB/sec. ··· 322 309 * Replay count measures number of "replay" events, which is basicly 323 310 * number of retries done by PCIe. 324 311 * 325 - * CPUCP_PACKET_TOTAL_ENERGY_GET 312 + * CPUCP_PACKET_TOTAL_ENERGY_GET - 326 313 * Total Energy is measurement of energy from the time FW Linux 327 314 * is loaded. It is calculated by multiplying the average power 328 315 * by time (passed from armcp start). The units are in MilliJouls. 329 316 * 330 - * CPUCP_PACKET_PLL_INFO_GET 317 + * CPUCP_PACKET_PLL_INFO_GET - 331 318 * Fetch frequencies of PLL from the required PLL IP. 
332 319 * The packet's arguments specify the device PLL type 333 320 * Pll type is the PLL from device pll_index enum. 334 321 * The result is composed of 4 outputs, each is 16-bit 335 322 * frequency in MHz. 336 323 * 337 - * CPUCP_PACKET_POWER_GET 324 + * CPUCP_PACKET_POWER_GET - 338 325 * Fetch the present power consumption of the device (Current * Voltage). 339 326 * 340 327 * CPUCP_PACKET_NIC_PFC_SET - ··· 358 345 * CPUCP_PACKET_MSI_INFO_SET - 359 346 * set the index number for each supported msi type going from 360 347 * host to device 348 + * 349 + * CPUCP_PACKET_NIC_XPCS91_REGS_GET - 350 + * Fetch the un/correctable counters values from the NIC MAC. 351 + * 352 + * CPUCP_PACKET_NIC_STAT_REGS_GET - 353 + * Fetch various NIC MAC counters from the NIC STAT. 354 + * 355 + * CPUCP_PACKET_NIC_STAT_REGS_CLR - 356 + * Clear the various NIC MAC counters in the NIC STAT. 357 + * 358 + * CPUCP_PACKET_NIC_STAT_REGS_ALL_GET - 359 + * Fetch all NIC MAC counters from the NIC STAT. 360 + * 361 + * CPUCP_PACKET_IS_IDLE_CHECK - 362 + * Check if the device is IDLE in regard to the DMA/compute engines 363 + * and QMANs. The f/w will return a bitmask where each bit represents 364 + * a different engine or QMAN according to enum cpucp_idle_mask. 365 + * The bit will be 1 if the engine is NOT idle. 
361 366 */ 362 367 363 368 enum cpucp_packet_id { ··· 416 385 CPUCP_PACKET_NIC_LPBK_SET, /* internal */ 417 386 CPUCP_PACKET_NIC_MAC_CFG, /* internal */ 418 387 CPUCP_PACKET_MSI_INFO_SET, /* internal */ 388 + CPUCP_PACKET_NIC_XPCS91_REGS_GET, /* internal */ 389 + CPUCP_PACKET_NIC_STAT_REGS_GET, /* internal */ 390 + CPUCP_PACKET_NIC_STAT_REGS_CLR, /* internal */ 391 + CPUCP_PACKET_NIC_STAT_REGS_ALL_GET, /* internal */ 392 + CPUCP_PACKET_IS_IDLE_CHECK, /* internal */ 419 393 }; 420 394 421 395 #define CPUCP_PACKET_FENCE_VAL 0xFE8CE7A5 ··· 449 413 #define CPUCP_PKT_VAL_LPBK_IN1_MASK 0x0000000000000001ull 450 414 #define CPUCP_PKT_VAL_LPBK_IN2_SHIFT 1 451 415 #define CPUCP_PKT_VAL_LPBK_IN2_MASK 0x000000000000001Eull 416 + 417 + #define CPUCP_PKT_VAL_MAC_CNT_IN1_SHIFT 0 418 + #define CPUCP_PKT_VAL_MAC_CNT_IN1_MASK 0x0000000000000001ull 419 + #define CPUCP_PKT_VAL_MAC_CNT_IN2_SHIFT 1 420 + #define CPUCP_PKT_VAL_MAC_CNT_IN2_MASK 0x00000000FFFFFFFEull 452 421 453 422 /* heartbeat status bits */ 454 423 #define CPUCP_PKT_HB_STATUS_EQ_FAULT_SHIFT 0 ··· 508 467 __le32 status_mask; 509 468 }; 510 469 511 - __le32 reserved; 470 + /* For NIC requests */ 471 + __le32 port_index; 512 472 }; 513 473 514 474 struct cpucp_unmask_irq_arr_packet { 515 475 struct cpucp_packet cpucp_pkt; 516 476 __le32 length; 517 477 __le32 irqs[0]; 478 + }; 479 + 480 + struct cpucp_nic_status_packet { 481 + struct cpucp_packet cpucp_pkt; 482 + __le32 length; 483 + __le32 data[0]; 518 484 }; 519 485 520 486 struct cpucp_array_data_packet { ··· 643 595 PLL_MAX 644 596 }; 645 597 598 + enum rl_index { 599 + TPC_RL = 0, 600 + MME_RL, 601 + }; 602 + 603 + enum pvt_index { 604 + PVT_SW, 605 + PVT_SE, 606 + PVT_NW, 607 + PVT_NE 608 + }; 609 + 646 610 /* Event Queue Packets */ 647 611 648 612 struct eq_generic_event { ··· 760 700 __u8 mac_addr[ETH_ALEN]; 761 701 }; 762 702 703 + enum cpucp_serdes_type { 704 + TYPE_1_SERDES_TYPE, 705 + TYPE_2_SERDES_TYPE, 706 + HLS1_SERDES_TYPE, 707 + HLS1H_SERDES_TYPE, 708 + 
UNKNOWN_SERDES_TYPE, 709 + MAX_NUM_SERDES_TYPE = UNKNOWN_SERDES_TYPE 710 + }; 711 + 763 712 struct cpucp_nic_info { 764 713 struct cpucp_mac_addr mac_addrs[CPUCP_MAX_NICS]; 765 714 __le64 link_mask[CPUCP_NIC_MASK_ARR_LEN]; ··· 777 708 __le64 link_ext_mask[CPUCP_NIC_MASK_ARR_LEN]; 778 709 __u8 qsfp_eeprom[CPUCP_NIC_QSFP_EEPROM_MAX_LEN]; 779 710 __le64 auto_neg_mask[CPUCP_NIC_MASK_ARR_LEN]; 711 + __le16 serdes_type; /* enum cpucp_serdes_type */ 712 + __u8 reserved[6]; 713 + }; 714 + 715 + /* 716 + * struct cpucp_nic_status - describes the status of a NIC port. 717 + * @port: NIC port index. 718 + * @bad_format_cnt: e.g. CRC. 719 + * @responder_out_of_sequence_psn_cnt: e.g. NAK. 720 + * @high_ber_reinit: link reinit due to high BER. 721 + * @correctable_err_cnt: e.g. bit-flip. 722 + * @uncorrectable_err_cnt: e.g. MAC errors. 723 + * @retraining_cnt: re-training counter. 724 + * @up: is port up. 725 + * @pcs_link: has PCS link. 726 + * @phy_ready: is PHY ready. 727 + * @auto_neg: is Autoneg enabled. 728 + * @timeout_retransmission_cnt: timeout retransmission events 729 + * @high_ber_cnt: high BER events 730 + */ 731 + struct cpucp_nic_status { 732 + __le32 port; 733 + __le32 bad_format_cnt; 734 + __le32 responder_out_of_sequence_psn_cnt; 735 + __le32 high_ber_reinit; 736 + __le32 correctable_err_cnt; 737 + __le32 uncorrectable_err_cnt; 738 + __le32 retraining_cnt; 739 + __u8 up; 740 + __u8 pcs_link; 741 + __u8 phy_ready; 742 + __u8 auto_neg; 743 + __le32 timeout_retransmission_cnt; 744 + __le32 high_ber_cnt; 780 745 }; 781 746 782 747 #endif /* CPUCP_IF_H */

+57 -5

drivers/misc/habanalabs/include/common/hl_boot_if.h

··· 78 78 * CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL Device is unusable and customer support 79 79 * should be contacted. 80 80 * 81 + * CPU_BOOT_ERR0_ARC0_HALT_ACK_NOT_RCVD HALT ACK from ARC0 is not received 82 + * within specified retries after issuing 83 + * HALT request. ARC0 appears to be in bad 84 + * reset. 85 + * 86 + * CPU_BOOT_ERR0_ARC1_HALT_ACK_NOT_RCVD HALT ACK from ARC1 is not received 87 + * within specified retries after issuing 88 + * HALT request. ARC1 appears to be in bad 89 + * reset. 90 + * 91 + * CPU_BOOT_ERR0_ARC0_RUN_ACK_NOT_RCVD RUN ACK from ARC0 is not received 92 + * within specified timeout after issuing 93 + * RUN request. ARC0 appears to be in bad 94 + * reset. 95 + * 96 + * CPU_BOOT_ERR0_ARC1_RUN_ACK_NOT_RCVD RUN ACK from ARC1 is not received 97 + * within specified timeout after issuing 98 + * RUN request. ARC1 appears to be in bad 99 + * reset. 100 + * 81 101 * CPU_BOOT_ERR0_ENABLED Error registers enabled. 82 102 * This is a main indication that the 83 103 * running FW populates the error ··· 118 98 #define CPU_BOOT_ERR0_SEC_IMG_VER_FAIL (1 << 11) 119 99 #define CPU_BOOT_ERR0_PLL_FAIL (1 << 12) 120 100 #define CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL (1 << 13) 101 + #define CPU_BOOT_ERR0_ARC0_HALT_ACK_NOT_RCVD (1 << 14) 102 + #define CPU_BOOT_ERR0_ARC1_HALT_ACK_NOT_RCVD (1 << 15) 103 + #define CPU_BOOT_ERR0_ARC0_RUN_ACK_NOT_RCVD (1 << 16) 104 + #define CPU_BOOT_ERR0_ARC1_RUN_ACK_NOT_RCVD (1 << 17) 121 105 #define CPU_BOOT_ERR0_ENABLED (1 << 31) 122 106 #define CPU_BOOT_ERR1_ENABLED (1 << 31) 123 107 ··· 210 186 * configured and is ready for use. 211 187 * Initialized in: ppboot 212 188 * 189 + * CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN NIC MAC channels init is done by FW and 190 + * any access to them is done via the FW. 191 + * Initialized in: linux 192 + * 213 193 * CPU_BOOT_DEV_STS0_DYN_PLL_EN Dynamic PLL configuration is enabled. 214 194 * FW sends to host a bitmap of supported 215 195 * PLLs. 
··· 235 207 * 236 208 * CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN Use multiple scratchpad interfaces to 237 209 * prevent IRQs overriding each other. 210 + * Initialized in: linux 211 + * 212 + * CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN 213 + * NIC STAT and XPCS91 access is restricted 214 + * and is done via FW only. 215 + * Initialized in: linux 216 + * 217 + * CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN 218 + * NIC STAT get all is supported. 219 + * Initialized in: linux 220 + * 221 + * CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN 222 + * F/W checks if the device is idle by reading defined set 223 + * of registers. It returns a bitmask of all the engines, 224 + * where a bit is set if the engine is not idle. 238 225 * Initialized in: linux 239 226 * 240 227 * CPU_BOOT_DEV_STS0_ENABLED Device status register enabled. ··· 279 236 #define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN (1 << 15) 280 237 #define CPU_BOOT_DEV_STS0_FW_LD_COM_EN (1 << 16) 281 238 #define CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN (1 << 17) 239 + #define CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN (1 << 18) 282 240 #define CPU_BOOT_DEV_STS0_DYN_PLL_EN (1 << 19) 283 241 #define CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN (1 << 20) 284 242 #define CPU_BOOT_DEV_STS0_EQ_INDEX_EN (1 << 21) 285 243 #define CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN (1 << 22) 244 + #define CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN (1 << 23) 245 + #define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN (1 << 24) 246 + #define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN (1 << 25) 286 247 #define CPU_BOOT_DEV_STS0_ENABLED (1 << 31) 287 248 #define CPU_BOOT_DEV_STS1_ENABLED (1 << 31) 288 249 ··· 360 313 __le32 hw_state; 361 314 __le32 kmd_msg_to_cpu; 362 315 __le32 cpu_cmd_status_to_host; 363 - union { 364 - __le32 gic_host_irq_ctrl; 365 - __le32 gic_host_pi_upd_irq; 366 - }; 316 + __le32 gic_host_pi_upd_irq; 367 317 __le32 gic_tpc_qm_irq_ctrl; 368 318 __le32 gic_mme_qm_irq_ctrl; 369 319 __le32 gic_dma_qm_irq_ctrl; ··· 368 324 __le32 gic_dma_core_irq_ctrl; 369 325 __le32 gic_host_halt_irq; 370 326 __le32 
gic_host_ints_irq; 371 - __le32 reserved1[24]; /* reserve for future use */ 327 + __le32 gic_host_soft_rst_irq; 328 + __le32 gic_rot_qm_irq_ctrl; 329 + __le32 reserved1[22]; /* reserve for future use */ 372 330 }; 373 331 374 332 /* TODO: remove the desc magic after the code is updated to use message */ ··· 508 462 * Do not wait for BMC response. 509 463 * 510 464 * COMMS_LOW_PLL_OPP Initialize PLLs for low OPP. 465 + * 466 + * COMMS_PREP_DESC_ELBI Same as COMMS_PREP_DESC only that the memory 467 + * space is allocated in a ELBI access only 468 + * address range. 469 + * 511 470 */ 512 471 enum comms_cmd { 513 472 COMMS_NOOP = 0, ··· 525 474 COMMS_GOTO_WFE = 7, 526 475 COMMS_SKIP_BMC = 8, 527 476 COMMS_LOW_PLL_OPP = 9, 477 + COMMS_PREP_DESC_ELBI = 10, 528 478 COMMS_INVLD_LAST 529 479 }; 530 480

+3

drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h

··· 126 126 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_1 0x4F2004 127 127 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_2047 0x4F3FFC 128 128 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0 0x4F4000 129 + #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRH_0 0x4F4800 130 + #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_DATA_0 0x4F5000 131 + #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_ARM_0 0x4F5800 129 132 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0 0x4F6000 130 133 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_511 0x4F67FC 131 134

+17

drivers/misc/habanalabs/include/gaudi/gaudi_masks.h

··· 449 449 #define PCIE_AUX_FLR_CTRL_HW_CTRL_MASK 0x1 450 450 #define PCIE_AUX_FLR_CTRL_INT_MASK_MASK 0x2 451 451 452 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_SHIFT 0 453 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_MASK 0x1 454 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_SHIFT 1 455 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_MASK 0x1FE 456 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_SHIFT 0 457 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK 0xFF 458 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_SHIFT 8 459 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK 0xFF00 460 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOP_SHIFT 16 461 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOP_MASK 0x10000 462 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_SHIFT 17 463 + #define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_MASK 0xFFFE0000 464 + #define TPC0_QM_CP_STS_0_FENCE_ID_SHIFT 20 465 + #define TPC0_QM_CP_STS_0_FENCE_ID_MASK 0x300000 466 + #define TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_SHIFT 22 467 + #define TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_MASK 0x400000 468 + 452 469 #endif /* GAUDI_MASKS_H_ */

-2

drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h

··· 12 12 * PSOC scratch-pad registers 13 13 */ 14 14 #define mmHW_STATE mmPSOC_GLOBAL_CONF_SCRATCHPAD_0 15 - /* TODO: remove mmGIC_HOST_IRQ_CTRL_POLL_REG */ 16 - #define mmGIC_HOST_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_1 17 15 #define mmGIC_HOST_PI_UPD_IRQ_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_1 18 16 #define mmGIC_TPC_QM_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_2 19 17 #define mmGIC_MME_QM_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_3

+167 -21

include/uapi/misc/habanalabs.h

··· 276 276 HL_DEVICE_STATUS_OPERATIONAL, 277 277 HL_DEVICE_STATUS_IN_RESET, 278 278 HL_DEVICE_STATUS_MALFUNCTION, 279 - HL_DEVICE_STATUS_NEEDS_RESET 279 + HL_DEVICE_STATUS_NEEDS_RESET, 280 + HL_DEVICE_STATUS_IN_DEVICE_CREATION, 281 + HL_DEVICE_STATUS_LAST = HL_DEVICE_STATUS_IN_DEVICE_CREATION 282 + }; 283 + 284 + enum hl_server_type { 285 + HL_SERVER_TYPE_UNKNOWN = 0, 286 + HL_SERVER_GAUDI_HLS1 = 1, 287 + HL_SERVER_GAUDI_HLS1H = 2, 288 + HL_SERVER_GAUDI_TYPE1 = 3, 289 + HL_SERVER_GAUDI_TYPE2 = 4 280 290 }; 281 291 282 292 /* Opcode for management ioctl ··· 347 337 #define HL_INFO_VERSION_MAX_LEN 128 348 338 #define HL_INFO_CARD_NAME_MAX_LEN 16 349 339 340 + /** 341 + * struct hl_info_hw_ip_info - hardware information on various IPs in the ASIC 342 + * @sram_base_address: The first SRAM physical base address that is free to be 343 + * used by the user. 344 + * @dram_base_address: The first DRAM virtual or physical base address that is 345 + * free to be used by the user. 346 + * @dram_size: The DRAM size that is available to the user. 347 + * @sram_size: The SRAM size that is available to the user. 348 + * @num_of_events: The number of events that can be received from the f/w. This 349 + * is needed so the user can know the size of the h/w events 350 + * array he needs to pass to the kernel when he wants to fetch 351 + * the event counters. 352 + * @device_id: PCI device ID of the ASIC. 353 + * @module_id: Module ID of the ASIC for mezzanine cards in servers 354 + * (From OCP spec). 355 + * @first_available_interrupt_id: The first available interrupt ID for the user 356 + * to be used when it works with user interrupts. 357 + * @server_type: Server type that the Gaudi ASIC is currently installed in. 358 + * The value is according to enum hl_server_type 359 + * @cpld_version: CPLD version on the board. 360 + * @psoc_pci_pll_nr: PCI PLL NR value. Needed by the profiler in some ASICs. 361 + * @psoc_pci_pll_nf: PCI PLL NF value. Needed by the profiler in some ASICs. 
362 + * @psoc_pci_pll_od: PCI PLL OD value. Needed by the profiler in some ASICs. 363 + * @psoc_pci_pll_div_factor: PCI PLL DIV factor value. Needed by the profiler 364 + * in some ASICs. 365 + * @tpc_enabled_mask: Bit-mask that represents which TPCs are enabled. Relevant 366 + * for Goya/Gaudi only. 367 + * @dram_enabled: Whether the DRAM is enabled. 368 + * @cpucp_version: The CPUCP f/w version. 369 + * @card_name: The card name as passed by the f/w. 370 + * @dram_page_size: The DRAM physical page size. 371 + */ 350 372 struct hl_info_hw_ip_info { 351 373 __u64 sram_base_address; 352 374 __u64 dram_base_address; 353 375 __u64 dram_size; 354 376 __u32 sram_size; 355 377 __u32 num_of_events; 356 - __u32 device_id; /* PCI Device ID */ 357 - __u32 module_id; /* For mezzanine cards in servers (From OCP spec.) */ 378 + __u32 device_id; 379 + __u32 module_id; 358 380 __u32 reserved; 359 381 __u16 first_available_interrupt_id; 360 - __u16 reserved2; 382 + __u16 server_type; 361 383 __u32 cpld_version; 362 384 __u32 psoc_pci_pll_nr; 363 385 __u32 psoc_pci_pll_nf; ··· 400 358 __u8 pad[2]; 401 359 __u8 cpucp_version[HL_INFO_VERSION_MAX_LEN]; 402 360 __u8 card_name[HL_INFO_CARD_NAME_MAX_LEN]; 403 - __u64 reserved3; 361 + __u64 reserved2; 404 362 __u64 dram_page_size; 405 363 }; 406 364 ··· 670 628 __u64 cb_handle; 671 629 672 630 /* Relevant only when HL_CS_FLAGS_WAIT or 673 - * HL_CS_FLAGS_COLLECTIVE_WAIT is set. 631 + * HL_CS_FLAGS_COLLECTIVE_WAIT is set 674 632 * This holds address of array of u64 values that contain 675 - * signal CS sequence numbers. The wait described by this job 676 - * will listen on all those signals (wait event per signal) 633 + * signal CS sequence numbers. The wait described by 634 + * this job will listen on all those signals 635 + * (wait event per signal) 677 636 */ 678 637 __u64 signal_seq_arr; 638 + 639 + /* 640 + * Relevant only when HL_CS_FLAGS_WAIT or 641 + * HL_CS_FLAGS_COLLECTIVE_WAIT is set 642 + * along with HL_CS_FLAGS_ENCAP_SIGNALS. 
643 + * This is the CS sequence which has the encapsulated signals. 644 + */ 645 + __u64 encaps_signal_seq; 679 646 }; 680 647 681 648 /* Index of queue to put the CB on */ ··· 702 651 * Number of entries in signal_seq_arr 703 652 */ 704 653 __u32 num_signal_seq_arr; 654 + 655 + /* Relevant only when HL_CS_FLAGS_WAIT or 656 + * HL_CS_FLAGS_COLLECTIVE_WAIT is set along 657 + * with HL_CS_FLAGS_ENCAP_SIGNALS 658 + * This set the signals range that the user want to wait for 659 + * out of the whole reserved signals range. 660 + * e.g if the signals range is 20, and user don't want 661 + * to wait for signal 8, so he set this offset to 7, then 662 + * he call the API again with 9 and so on till 20. 663 + */ 664 + __u32 encaps_signal_offset; 705 665 }; 706 666 707 667 /* HL_CS_CHUNK_FLAGS_* */ ··· 740 678 #define HL_CS_FLAGS_CUSTOM_TIMEOUT 0x200 741 679 #define HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT 0x400 742 680 681 + /* 682 + * The encapsulated signals CS is merged into the existing CS ioctls. 683 + * In order to use this feature need to follow the below procedure: 684 + * 1. Reserve signals, set the CS type to HL_CS_FLAGS_RESERVE_SIGNALS_ONLY 685 + * the output of this API will be the SOB offset from CFG_BASE. 686 + * this address will be used to patch CB cmds to do the signaling for this 687 + * SOB by incrementing it's value. 688 + * for reverting the reservation use HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY 689 + * CS type, note that this might fail if out-of-sync happened to the SOB 690 + * value, in case other signaling request to the same SOB occurred between 691 + * reserve-unreserve calls. 692 + * 2. Use the staged CS to do the encapsulated signaling jobs. 693 + * use HL_CS_FLAGS_STAGED_SUBMISSION and HL_CS_FLAGS_STAGED_SUBMISSION_FIRST 694 + * along with HL_CS_FLAGS_ENCAP_SIGNALS flag, and set encaps_signal_offset 695 + * field. This offset allows app to wait on part of the reserved signals. 696 + * 3. 
Use WAIT/COLLECTIVE WAIT CS along with HL_CS_FLAGS_ENCAP_SIGNALS flag 697 + * to wait for the encapsulated signals. 698 + */ 699 + #define HL_CS_FLAGS_ENCAP_SIGNALS 0x800 700 + #define HL_CS_FLAGS_RESERVE_SIGNALS_ONLY 0x1000 701 + #define HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY 0x2000 702 + 743 703 #define HL_CS_STATUS_SUCCESS 0 744 704 745 705 #define HL_MAX_JOBS_PER_CS 512 ··· 774 690 /* holds address of array of hl_cs_chunk for execution phase */ 775 691 __u64 chunks_execute; 776 692 777 - /* Sequence number of a staged submission CS 778 - * valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set 779 - */ 780 - __u64 seq; 693 + union { 694 + /* 695 + * Sequence number of a staged submission CS 696 + * valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set and 697 + * HL_CS_FLAGS_STAGED_SUBMISSION_FIRST is unset. 698 + */ 699 + __u64 seq; 700 + 701 + /* 702 + * Encapsulated signals handle id 703 + * Valid for two flows: 704 + * 1. CS with encapsulated signals: 705 + * when HL_CS_FLAGS_STAGED_SUBMISSION and 706 + * HL_CS_FLAGS_STAGED_SUBMISSION_FIRST 707 + * and HL_CS_FLAGS_ENCAP_SIGNALS are set. 708 + * 2. unreserve signals: 709 + * valid when HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY is set. 710 + */ 711 + __u32 encaps_sig_handle_id; 712 + 713 + /* Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set */ 714 + struct { 715 + /* Encapsulated signals number */ 716 + __u32 encaps_signals_count; 717 + 718 + /* Encapsulated signals queue index (stream) */ 719 + __u32 encaps_signals_q_idx; 720 + }; 721 + }; 781 722 782 723 /* Number of chunks in restore phase array. Maximum number is 783 724 * HL_MAX_JOBS_PER_CS ··· 827 718 }; 828 719 829 720 struct hl_cs_out { 830 - /* 831 - * seq holds the sequence number of the CS to pass to wait ioctl. All 832 - * values are valid except for 0 and ULLONG_MAX 833 - */ 834 - __u64 seq; 835 - /* HL_CS_STATUS_* */ 721 + union { 722 + /* 723 + * seq holds the sequence number of the CS to pass to wait 724 + * ioctl. 
All values are valid except for 0 and ULLONG_MAX 725 + */ 726 + __u64 seq; 727 + 728 + /* Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set */ 729 + struct { 730 + /* This is the reserved signal handle id */ 731 + __u32 handle_id; 732 + 733 + /* This is the signals count */ 734 + __u32 count; 735 + }; 736 + }; 737 + 738 + /* HL_CS_STATUS */ 739 __u32 status; 740 + 741 + /* 742 + * SOB base address offset 743 + * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set 744 + */ 745 + __u32 sob_base_addr_offset; 838 746 }; 839 747 840 748 union hl_cs_args { ··· 861 735 862 736 #define HL_WAIT_CS_FLAGS_INTERRUPT 0x2 863 737 #define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000 738 + #define HL_WAIT_CS_FLAGS_MULTI_CS 0x4 739 + 740 + #define HL_WAIT_MULTI_CS_LIST_MAX_LEN 32 864 741 865 742 struct hl_wait_cs_in { 866 743 union { 867 744 struct { 868 - /* Command submission sequence number */ 745 + /* 746 + * In case of wait_cs holds the CS sequence number. 747 + * In case of wait for multi CS holds a user pointer to 748 + * an array of CS sequence numbers 749 + */ 869 750 __u64 seq; 870 751 /* Absolute timeout to wait for command submission 871 752 * in microseconds ··· 900 767 901 768 /* Context ID - Currently not in use */ 902 769 __u32 ctx_id; 770 + 903 771 /* HL_WAIT_CS_FLAGS_* 904 772 * If HL_WAIT_CS_FLAGS_INTERRUPT is set, this field should include 905 773 * interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK, in order 906 774 * not to specify an interrupt id, set mask to all 1s. 
907 775 */ 908 776 __u32 flags; 777 + 778 + /* Multi CS API info - valid entries in multi-CS array */ 779 + __u8 seq_arr_len; 780 + __u8 pad[7]; 909 781 }; 910 782 911 783 #define HL_WAIT_CS_STATUS_COMPLETED 0 ··· 927 789 __u32 status; 928 790 /* HL_WAIT_CS_STATUS_FLAG* */ 929 791 __u32 flags; 930 - /* valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set */ 792 + /* 793 + * valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set 794 + * for wait_cs: timestamp of CS completion 795 + * for wait_multi_cs: timestamp of FIRST CS completion 796 + */ 931 797 __s64 timestamp_nsec; 798 + /* multi CS completion bitmap */ 799 + __u32 cs_completion_map; 800 + __u32 pad; 932 801 }; 933 802 934 803 union hl_wait_cs_args { ··· 958 813 #define HL_MEM_CONTIGUOUS 0x1 959 814 #define HL_MEM_SHARED 0x2 960 815 #define HL_MEM_USERPTR 0x4 816 + #define HL_MEM_FORCE_HINT 0x8 961 817 962 818 struct hl_mem_in { 963 819 union {