Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe/pf: Track adverse events notifications from GuC

When thresholds used to monitor VF activity are configured, the
GuC may send GUC2PF_ADVERSE_EVENT messages to inform the PF driver
that a threshold was exceeded. Start handling such messages.

Reviewed-by: Piotr Piórkowski <piotr.piorkowski@intel.com>
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240514190015.2172-8-michal.wajdeczko@intel.com

+206
+1
drivers/gpu/drm/xe/Makefile
··· 164 164 xe_gt_sriov_pf_config.o \ 165 165 xe_gt_sriov_pf_control.o \ 166 166 xe_gt_sriov_pf_debugfs.o \ 167 + xe_gt_sriov_pf_monitor.o \ 167 168 xe_gt_sriov_pf_policy.o \ 168 169 xe_gt_sriov_pf_service.o \ 169 170 xe_lmtt.o \
+147
drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.c
··· 1 + // SPDX-License-Identifier: MIT 2 + /* 3 + * Copyright © 2023-2024 Intel Corporation 4 + */ 5 + 6 + #include "abi/guc_actions_sriov_abi.h" 7 + #include "abi/guc_messages_abi.h" 8 + 9 + #include "xe_gt_sriov_pf_config.h" 10 + #include "xe_gt_sriov_pf_helpers.h" 11 + #include "xe_gt_sriov_pf_monitor.h" 12 + #include "xe_gt_sriov_printk.h" 13 + #include "xe_guc_klv_helpers.h" 14 + #include "xe_guc_klv_thresholds_set.h" 15 + 16 + /** 17 + * xe_gt_sriov_pf_monitor_flr - Cleanup VF data after VF FLR. 18 + * @gt: the &xe_gt 19 + * @vfid: the VF identifier 20 + * 21 + * On FLR this function will reset all event data related to the VF. 22 + * This function is for PF only. 23 + */ 24 + void xe_gt_sriov_pf_monitor_flr(struct xe_gt *gt, u32 vfid) 25 + { 26 + int e; 27 + 28 + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); 29 + xe_gt_sriov_pf_assert_vfid(gt, vfid); 30 + 31 + for (e = 0; e < XE_GUC_KLV_NUM_THRESHOLDS; e++) 32 + gt->sriov.pf.vfs[vfid].monitor.guc.events[e] = 0; 33 + } 34 + 35 + static void pf_update_event_counter(struct xe_gt *gt, u32 vfid, 36 + enum xe_guc_klv_threshold_index e) 37 + { 38 + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); 39 + xe_gt_assert(gt, e < XE_GUC_KLV_NUM_THRESHOLDS); 40 + 41 + gt->sriov.pf.vfs[vfid].monitor.guc.events[e]++; 42 + } 43 + 44 + static int pf_handle_vf_threshold_event(struct xe_gt *gt, u32 vfid, u32 threshold) 45 + { 46 + char origin[8]; 47 + int e; 48 + 49 + e = xe_guc_klv_threshold_key_to_index(threshold); 50 + xe_sriov_function_name(vfid, origin, sizeof(origin)); 51 + 52 + /* was there a new KEY added that we missed? 
*/ 53 + if (unlikely(e < 0)) { 54 + xe_gt_sriov_notice(gt, "unknown threshold key %#x reported for %s\n", 55 + threshold, origin); 56 + return -ENOTCONN; 57 + } 58 + 59 + xe_gt_sriov_dbg(gt, "%s exceeded threshold %u %s\n", 60 + origin, xe_gt_sriov_pf_config_get_threshold(gt, vfid, e), 61 + xe_guc_klv_key_to_string(threshold)); 62 + 63 + pf_update_event_counter(gt, vfid, e); 64 + 65 + return 0; 66 + } 67 + 68 + /** 69 + * xe_gt_sriov_pf_monitor_process_guc2pf - Handle adverse event notification from the GuC. 70 + * @gt: the &xe_gt 71 + * @msg: G2H event message 72 + * @len: length of the message 73 + * 74 + * This function is intended for PF only. 75 + * 76 + * Return: 0 on success or a negative error code on failure. 77 + */ 78 + int xe_gt_sriov_pf_monitor_process_guc2pf(struct xe_gt *gt, const u32 *msg, u32 len) 79 + { 80 + struct xe_device *xe = gt_to_xe(gt); 81 + u32 vfid; 82 + u32 threshold; 83 + 84 + xe_gt_assert(gt, len >= GUC_HXG_MSG_MIN_LEN); 85 + xe_gt_assert(gt, FIELD_GET(GUC_HXG_MSG_0_ORIGIN, msg[0]) == GUC_HXG_ORIGIN_GUC); 86 + xe_gt_assert(gt, FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[0]) == GUC_HXG_TYPE_EVENT); 87 + xe_gt_assert(gt, FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[0]) == 88 + GUC_ACTION_GUC2PF_ADVERSE_EVENT); 89 + 90 + if (unlikely(!IS_SRIOV_PF(xe))) 91 + return -EPROTO; 92 + 93 + if (unlikely(FIELD_GET(GUC2PF_ADVERSE_EVENT_EVENT_MSG_0_MBZ, msg[0]))) 94 + return -EPFNOSUPPORT; 95 + 96 + if (unlikely(len < GUC2PF_ADVERSE_EVENT_EVENT_MSG_LEN)) 97 + return -EPROTO; 98 + 99 + vfid = FIELD_GET(GUC2PF_ADVERSE_EVENT_EVENT_MSG_1_VFID, msg[1]); 100 + threshold = FIELD_GET(GUC2PF_ADVERSE_EVENT_EVENT_MSG_2_THRESHOLD, msg[2]); 101 + 102 + if (unlikely(vfid > xe_gt_sriov_pf_get_totalvfs(gt))) 103 + return -EINVAL; 104 + 105 + return pf_handle_vf_threshold_event(gt, vfid, threshold); 106 + } 107 + 108 + /** 109 + * xe_gt_sriov_pf_monitor_print_events - Print adverse events counters. 
110 + * @gt: the &xe_gt to print events from 111 + * @p: the &drm_printer 112 + * 113 + * Print adverse events counters for all VFs. 114 + * VFs with no events are not printed. 115 + * 116 + * This function can only be called on PF. 117 + */ 118 + void xe_gt_sriov_pf_monitor_print_events(struct xe_gt *gt, struct drm_printer *p) 119 + { 120 + unsigned int n, total_vfs = xe_gt_sriov_pf_get_totalvfs(gt); 121 + const struct xe_gt_sriov_monitor *data; 122 + int e; 123 + 124 + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); 125 + 126 + for (n = 1; n <= total_vfs; n++) { 127 + data = &gt->sriov.pf.vfs[n].monitor; 128 + 129 + for (e = 0; e < XE_GUC_KLV_NUM_THRESHOLDS; e++) 130 + if (data->guc.events[e]) 131 + break; 132 + 133 + /* skip empty unless in debug mode */ 134 + if (e >= XE_GUC_KLV_NUM_THRESHOLDS && 135 + !IS_ENABLED(CONFIG_DRM_XE_DEBUG_SRIOV)) 136 + continue; 137 + 138 + #define __format(...) "%s:%u " 139 + #define __value(TAG, NAME, ...) , #NAME, data->guc.events[MAKE_XE_GUC_KLV_THRESHOLD_INDEX(TAG)] 140 + 141 + drm_printf(p, "VF%u:\t" MAKE_XE_GUC_KLV_THRESHOLDS_SET(__format) "\n", 142 + n MAKE_XE_GUC_KLV_THRESHOLDS_SET(__value)); 143 + 144 + #undef __format 145 + #undef __value 146 + } 147 + }
+27
drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor.h
··· 1 + /* SPDX-License-Identifier: MIT */ 2 + /* 3 + * Copyright © 2023-2024 Intel Corporation 4 + */ 5 + 6 + #ifndef _XE_GT_SRIOV_PF_MONITOR_H_ 7 + #define _XE_GT_SRIOV_PF_MONITOR_H_ 8 + 9 + #include <linux/errno.h> 10 + #include <linux/types.h> 11 + 12 + struct drm_printer; 13 + struct xe_gt; 14 + 15 + void xe_gt_sriov_pf_monitor_flr(struct xe_gt *gt, u32 vfid); 16 + void xe_gt_sriov_pf_monitor_print_events(struct xe_gt *gt, struct drm_printer *p); 17 + 18 + #ifdef CONFIG_PCI_IOV 19 + int xe_gt_sriov_pf_monitor_process_guc2pf(struct xe_gt *gt, const u32 *msg, u32 len); 20 + #else 21 + static inline int xe_gt_sriov_pf_monitor_process_guc2pf(struct xe_gt *gt, const u32 *msg, u32 len) 22 + { 23 + return -EPROTO; 24 + } 25 + #endif 26 + 27 + #endif
+22
drivers/gpu/drm/xe/xe_gt_sriov_pf_monitor_types.h
··· 1 + /* SPDX-License-Identifier: MIT */ 2 + /* 3 + * Copyright © 2023-2024 Intel Corporation 4 + */ 5 + 6 + #ifndef _XE_GT_SRIOV_PF_MONITOR_TYPES_H_ 7 + #define _XE_GT_SRIOV_PF_MONITOR_TYPES_H_ 8 + 9 + #include "xe_guc_klv_thresholds_set_types.h" 10 + 11 + /** 12 + * struct xe_gt_sriov_monitor - GT level per-VF monitoring data. 13 + */ 14 + struct xe_gt_sriov_monitor { 15 + /** @guc: monitoring data related to the GuC. */ 16 + struct { 17 + /** @guc.events: number of adverse events reported by the GuC. */ 18 + unsigned int events[XE_GUC_KLV_NUM_THRESHOLDS]; 19 + } guc; 20 + }; 21 + 22 + #endif
+5
drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h
··· 9 9 #include <linux/types.h> 10 10 11 11 #include "xe_gt_sriov_pf_config_types.h" 12 + #include "xe_gt_sriov_pf_monitor_types.h" 12 13 #include "xe_gt_sriov_pf_policy_types.h" 13 14 #include "xe_gt_sriov_pf_service_types.h" 14 15 ··· 19 18 struct xe_gt_sriov_metadata { 20 19 /** @config: per-VF provisioning data. */ 21 20 struct xe_gt_sriov_config config; 21 + 22 + /** @monitor: per-VF monitoring data. */ 23 + struct xe_gt_sriov_monitor monitor; 24 + 22 25 /** @version: negotiated VF/PF ABI version */ 23 26 struct xe_gt_sriov_pf_service_version version; 24 27 };
+4
drivers/gpu/drm/xe/xe_guc_ct.c
··· 22 22 #include "xe_gt_pagefault.h" 23 23 #include "xe_gt_printk.h" 24 24 #include "xe_gt_sriov_pf_control.h" 25 + #include "xe_gt_sriov_pf_monitor.h" 25 26 #include "xe_gt_tlb_invalidation.h" 26 27 #include "xe_guc.h" 27 28 #include "xe_guc_relay.h" ··· 1071 1070 break; 1072 1071 case GUC_ACTION_GUC2PF_VF_STATE_NOTIFY: 1073 1072 ret = xe_gt_sriov_pf_control_process_guc2pf(gt, hxg, hxg_len); 1073 + break; 1074 + case GUC_ACTION_GUC2PF_ADVERSE_EVENT: 1075 + ret = xe_gt_sriov_pf_monitor_process_guc2pf(gt, hxg, hxg_len); 1074 1076 break; 1075 1077 default: 1076 1078 xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action);