Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

accel/habanalabs/gaudi2: add support for logging register accesses from debugfs

Add infrastructure for logging the last configuration register accesses
that occur via debugfs read/write operations. At interrupt time, these
log entries can be dumped to dmesg, which helps in diagnosing the cause
of RAZWI and ADDR_DEC interrupts.

The logging is implemented as a ring buffer of access entries, with each
entry recording a timestamp and the access details. To ensure correctness
under concurrent access, the ring buffer is protected by a spinlock.
Entries are copied under the lock and printed after releasing it, which
minimizes time spent in the critical section.

Signed-off-by: Sharley Calzolari <sharley.calzolari@intel.com>
Reviewed-by: Koby Elbaz <koby.elbaz@intel.com>
Signed-off-by: Koby Elbaz <koby.elbaz@intel.com>

authored by

Sharley Calzolari and committed by
Koby Elbaz
b5cddeb0 214e26a4

+148 -1
+111
drivers/accel/habanalabs/common/debugfs.c
··· 788 788 } 789 789 } 790 790 791 + static void dump_cfg_access_entry(struct hl_device *hdev, 792 + struct hl_debugfs_cfg_access_entry *entry) 793 + { 794 + char *access_type = ""; 795 + struct tm tm; 796 + 797 + switch (entry->debugfs_type) { 798 + case DEBUGFS_READ32: 799 + access_type = "READ32 from"; 800 + break; 801 + case DEBUGFS_WRITE32: 802 + access_type = "WRITE32 to"; 803 + break; 804 + case DEBUGFS_READ64: 805 + access_type = "READ64 from"; 806 + break; 807 + case DEBUGFS_WRITE64: 808 + access_type = "WRITE64 to"; 809 + break; 810 + default: 811 + dev_err(hdev->dev, "Invalid DEBUGFS access type (%u)\n", entry->debugfs_type); 812 + return; 813 + } 814 + 815 + time64_to_tm(entry->seconds_since_epoch, 0, &tm); 816 + dev_info(hdev->dev, 817 + "%ld-%02d-%02d %02d:%02d:%02d (UTC): %s %#llx\n", tm.tm_year + 1900, tm.tm_mon + 1, 818 + tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, access_type, entry->addr); 819 + } 820 + 821 + void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev) 822 + { 823 + struct hl_debugfs_cfg_access *dbgfs = &hdev->debugfs_cfg_accesses; 824 + u32 i, head, count = 0; 825 + time64_t entry_time, now; 826 + unsigned long flags; 827 + 828 + now = ktime_get_real_seconds(); 829 + 830 + spin_lock_irqsave(&dbgfs->lock, flags); 831 + head = dbgfs->head; 832 + if (head == 0) 833 + i = HL_DBGFS_CFG_ACCESS_HIST_LEN - 1; 834 + else 835 + i = head - 1; 836 + 837 + /* Walk back until timeout or invalid entry */ 838 + while (dbgfs->cfg_access_list[i].valid) { 839 + entry_time = dbgfs->cfg_access_list[i].seconds_since_epoch; 840 + /* Stop when entry is older than timeout */ 841 + if (now - entry_time > HL_DBGFS_CFG_ACCESS_HIST_TIMEOUT_SEC) 842 + break; 843 + 844 + /* print single entry under lock */ 845 + { 846 + struct hl_debugfs_cfg_access_entry entry = dbgfs->cfg_access_list[i]; 847 + /* 848 + * We copy the entry out under lock and then print after 849 + * releasing the lock to minimize time under lock. 
850 + */ 851 + spin_unlock_irqrestore(&dbgfs->lock, flags); 852 + dump_cfg_access_entry(hdev, &entry); 853 + spin_lock_irqsave(&dbgfs->lock, flags); 854 + } 855 + 856 + /* mark consumed */ 857 + dbgfs->cfg_access_list[i].valid = false; 858 + 859 + if (i == 0) 860 + i = HL_DBGFS_CFG_ACCESS_HIST_LEN - 1; 861 + else 862 + i--; 863 + count++; 864 + if (count >= HL_DBGFS_CFG_ACCESS_HIST_LEN) 865 + break; 866 + } 867 + spin_unlock_irqrestore(&dbgfs->lock, flags); 868 + } 869 + 870 + static void check_if_cfg_access_and_log(struct hl_device *hdev, u64 addr, size_t access_size, 871 + enum debugfs_access_type access_type) 872 + { 873 + struct hl_debugfs_cfg_access *dbgfs_cfg_accesses = &hdev->debugfs_cfg_accesses; 874 + struct pci_mem_region *mem_reg = &hdev->pci_mem_region[PCI_REGION_CFG]; 875 + struct hl_debugfs_cfg_access_entry *new_entry; 876 + unsigned long flags; 877 + 878 + /* Check if address is in config memory */ 879 + if (addr >= mem_reg->region_base && 880 + mem_reg->region_size >= access_size && 881 + addr <= mem_reg->region_base + mem_reg->region_size - access_size) { 882 + 883 + spin_lock_irqsave(&dbgfs_cfg_accesses->lock, flags); 884 + 885 + new_entry = &dbgfs_cfg_accesses->cfg_access_list[dbgfs_cfg_accesses->head]; 886 + new_entry->seconds_since_epoch = ktime_get_real_seconds(); 887 + new_entry->addr = addr; 888 + new_entry->debugfs_type = access_type; 889 + new_entry->valid = true; 890 + dbgfs_cfg_accesses->head = (dbgfs_cfg_accesses->head + 1) 891 + % HL_DBGFS_CFG_ACCESS_HIST_LEN; 892 + 893 + spin_unlock_irqrestore(&dbgfs_cfg_accesses->lock, flags); 894 + 895 + } 896 + } 897 + 791 898 static int hl_access_mem(struct hl_device *hdev, u64 addr, u64 *val, 792 899 enum debugfs_access_type acc_type) 793 900 { ··· 912 805 return rc; 913 806 } 914 807 808 + check_if_cfg_access_and_log(hdev, addr, acc_size, acc_type); 915 809 rc = hl_access_dev_mem_by_region(hdev, addr, val, acc_type, &found); 916 810 if (rc) { 917 811 dev_err(hdev->dev, ··· 1869 1761 
spin_lock_init(&dev_entry->cs_job_spinlock); 1870 1762 spin_lock_init(&dev_entry->userptr_spinlock); 1871 1763 mutex_init(&dev_entry->ctx_mem_hash_mutex); 1764 + 1765 + spin_lock_init(&hdev->debugfs_cfg_accesses.lock); 1766 + hdev->debugfs_cfg_accesses.head = 0; /* already zero by alloc but explicit init is fine */ 1872 1767 1873 1768 return 0; 1874 1769 }
+36 -1
drivers/accel/habanalabs/common/habanalabs.h
··· 90 90 #define HL_COMMON_USER_CQ_INTERRUPT_ID 0xFFF 91 91 #define HL_COMMON_DEC_INTERRUPT_ID 0xFFE 92 92 93 - #define HL_STATE_DUMP_HIST_LEN 5 93 + #define HL_STATE_DUMP_HIST_LEN 5 94 + #define HL_DBGFS_CFG_ACCESS_HIST_LEN 20 95 + #define HL_DBGFS_CFG_ACCESS_HIST_TIMEOUT_SEC 2 /* 2s */ 94 96 95 97 /* Default value for device reset trigger , an invalid value */ 96 98 #define HL_RESET_TRIGGER_DEFAULT 0xFF ··· 2439 2437 }; 2440 2438 2441 2439 /** 2440 + * struct hl_debugfs_cfg_access_entry - single debugfs config access object, member of 2441 + * hl_debugfs_cfg_access. 2442 + * @seconds_since_epoch: seconds since January 1, 1970, used for time comparisons. 2443 + * @debugfs_type: the debugfs operation requested, can be READ32, WRITE32, READ64 or WRITE64. 2444 + * @addr: the requested address to access. 2445 + * @valid: if set, this entry has valid data for dumping at interrupt time. 2446 + */ 2447 + struct hl_debugfs_cfg_access_entry { 2448 + ktime_t seconds_since_epoch; 2449 + enum debugfs_access_type debugfs_type; 2450 + u64 addr; 2451 + bool valid; 2452 + }; 2453 + 2454 + /** 2455 + * struct hl_debugfs_cfg_access - saves debugfs config region access requests history. 2456 + * @cfg_access_list: list of objects describing config region access requests. 2457 + * @head: next valid index to add new entry to in cfg_access_list. 2458 + */ 2459 + struct hl_debugfs_cfg_access { 2460 + struct hl_debugfs_cfg_access_entry cfg_access_list[HL_DBGFS_CFG_ACCESS_HIST_LEN]; 2461 + u32 head; 2462 + spinlock_t lock; /* protects head and entries */ 2463 + }; 2464 + 2465 + /** 2442 2466 * struct hl_hw_obj_name_entry - single hw object name, member of 2443 2467 * hl_state_dump_specs 2444 2468 * @node: link to the containing hash table ··· 3309 3281 * @hl_chip_info: ASIC's sensors information. 3310 3282 * @device_status_description: device status description. 3311 3283 * @hl_debugfs: device's debugfs manager. 3284 + * @debugfs_cfg_accesses: list of last debugfs config region accesses. 
3312 3285 * @cb_pool: list of pre allocated CBs. 3313 3286 * @cb_pool_lock: protects the CB pool. 3314 3287 * @internal_cb_pool_virt_addr: internal command buffer pool virtual address. ··· 3490 3461 struct hwmon_chip_info *hl_chip_info; 3491 3462 3492 3463 struct hl_dbg_device_entry hl_debugfs; 3464 + struct hl_debugfs_cfg_access debugfs_cfg_accesses; 3493 3465 3494 3466 struct list_head cb_pool; 3495 3467 spinlock_t cb_pool_lock; ··· 4140 4110 void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx); 4141 4111 void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data, 4142 4112 unsigned long length); 4113 + void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev); 4143 4114 4144 4115 #else 4145 4116 ··· 4213 4182 4214 4183 static inline void hl_debugfs_set_state_dump(struct hl_device *hdev, 4215 4184 char *data, unsigned long length) 4185 + { 4186 + } 4187 + 4188 + static inline void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev) 4216 4189 { 4217 4190 } 4218 4191
+1
drivers/accel/habanalabs/gaudi2/gaudi2.c
··· 10610 10610 if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR) 10611 10611 hl_handle_critical_hw_err(hdev, event_type, &event_mask); 10612 10612 10613 + hl_debugfs_cfg_access_history_dump(hdev); 10613 10614 event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; 10614 10615 hl_device_cond_reset(hdev, reset_flags, event_mask); 10615 10616 }