Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe/oa: Add OAR support

Add OAR support to allow userspace to execute MI_REPORT_PERF_COUNT on
render engines. Configuration batches are used to program the OAR unit,
as well as to modify the render engine context image of a specified exec
queue (to have correct register values when that context switches in).

v2: Rename/refactor xe_oa_modify_self (Umesh)
v3: Move IS_MI_LRI_CMD() into xe_oa.c (Michal)

Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240618014609.3233427-11-ashutosh.dixit@intel.com

+206 -5
+1
drivers/gpu/drm/xe/instructions/xe_mi_commands.h
··· 45 45 #define MI_LRI_MMIO_REMAP_EN REG_BIT(17) 46 46 #define MI_LRI_NUM_REGS(x) XE_INSTR_NUM_DW(2 * (x) + 1) 47 47 #define MI_LRI_FORCE_POSTED REG_BIT(12) 48 + #define MI_LRI_LEN(x) (((x) & 0xff) + 1) 48 49 49 50 #define MI_FLUSH_DW __MI_INSTR(0x26) 50 51 #define MI_FLUSH_DW_STORE_INDEX REG_BIT(21)
+1
drivers/gpu/drm/xe/regs/xe_engine_regs.h
··· 129 129 #define RING_EXECLIST_STATUS_HI(base) XE_REG((base) + 0x234 + 4) 130 130 131 131 #define RING_CONTEXT_CONTROL(base) XE_REG((base) + 0x244, XE_REG_OPTION_MASKED) 132 + #define CTX_CTRL_OAC_CONTEXT_ENABLE REG_BIT(8) 132 133 #define CTX_CTRL_INDIRECT_RING_STATE_ENABLE REG_BIT(4) 133 134 #define CTX_CTRL_INHIBIT_SYN_CTX_SWITCH REG_BIT(3) 134 135 #define CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT REG_BIT(0)
+6 -5
drivers/gpu/drm/xe/xe_lrc.c
··· 651 651 652 652 /* Make the magic macros work */ 653 653 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset 654 + #define __xe_lrc_regs_offset xe_lrc_regs_offset 654 655 655 656 #define LRC_SEQNO_PPHWSP_OFFSET 512 656 657 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8) 657 658 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8) 658 659 #define LRC_PARALLEL_PPHWSP_OFFSET 2048 659 660 #define LRC_PPHWSP_SIZE SZ_4K 661 + 662 + u32 xe_lrc_regs_offset(struct xe_lrc *lrc) 663 + { 664 + return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE; 665 + } 660 666 661 667 static size_t lrc_reg_size(struct xe_device *xe) 662 668 { ··· 699 693 { 700 694 /* The parallel is stored in the driver-defined portion of PPHWSP */ 701 695 return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET; 702 - } 703 - 704 - static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc) 705 - { 706 - return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE; 707 696 } 708 697 709 698 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
+1
drivers/gpu/drm/xe/xe_lrc.h
··· 52 52 53 53 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class); 54 54 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc); 55 + u32 xe_lrc_regs_offset(struct xe_lrc *lrc); 55 56 56 57 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail); 57 58 u32 xe_lrc_ring_tail(struct xe_lrc *lrc);
+193
drivers/gpu/drm/xe/xe_oa.c
··· 13 13 #include <drm/xe_drm.h> 14 14 15 15 #include "instructions/xe_mi_commands.h" 16 + #include "regs/xe_engine_regs.h" 16 17 #include "regs/xe_gt_regs.h" 18 + #include "regs/xe_lrc_layout.h" 17 19 #include "regs/xe_oa_regs.h" 18 20 #include "xe_assert.h" 19 21 #include "xe_bb.h" ··· 26 24 #include "xe_gt.h" 27 25 #include "xe_gt_mcr.h" 28 26 #include "xe_gt_printk.h" 27 + #include "xe_lrc.h" 29 28 #include "xe_macros.h" 30 29 #include "xe_mmio.h" 31 30 #include "xe_oa.h" ··· 59 56 60 57 struct kref ref; 61 58 struct rcu_head rcu; 59 + }; 60 + 61 + struct flex { 62 + struct xe_reg reg; 63 + u32 offset; 64 + u32 value; 62 65 }; 63 66 64 67 struct xe_oa_open_param { ··· 607 598 free_oa_config_bo(oa_bo); 608 599 } 609 600 601 + static void xe_oa_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc, 602 + struct xe_bb *bb, const struct flex *flex, u32 count) 603 + { 604 + u32 offset = xe_bo_ggtt_addr(lrc->bo); 605 + 606 + do { 607 + bb->cs[bb->len++] = MI_STORE_DATA_IMM | BIT(22) /* GGTT */ | 2; 608 + bb->cs[bb->len++] = offset + flex->offset * sizeof(u32); 609 + bb->cs[bb->len++] = 0; 610 + bb->cs[bb->len++] = flex->value; 611 + 612 + } while (flex++, --count); 613 + } 614 + 615 + static int xe_oa_modify_ctx_image(struct xe_oa_stream *stream, struct xe_lrc *lrc, 616 + const struct flex *flex, u32 count) 617 + { 618 + struct xe_bb *bb; 619 + int err; 620 + 621 + bb = xe_bb_new(stream->gt, 4 * count, false); 622 + if (IS_ERR(bb)) { 623 + err = PTR_ERR(bb); 624 + goto exit; 625 + } 626 + 627 + xe_oa_store_flex(stream, lrc, bb, flex, count); 628 + 629 + err = xe_oa_submit_bb(stream, bb); 630 + xe_bb_free(bb, NULL); 631 + exit: 632 + return err; 633 + } 634 + 635 + static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri) 636 + { 637 + struct xe_bb *bb; 638 + int err; 639 + 640 + bb = xe_bb_new(stream->gt, 3, false); 641 + if (IS_ERR(bb)) { 642 + err = PTR_ERR(bb); 643 + goto exit; 644 + } 645 + 646 + write_cs_mi_lri(bb, reg_lri, 1); 
647 + 648 + err = xe_oa_submit_bb(stream, bb); 649 + xe_bb_free(bb, NULL); 650 + exit: 651 + return err; 652 + } 653 + 654 + static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable) 655 + { 656 + const struct xe_oa_format *format = stream->oa_buffer.format; 657 + struct xe_lrc *lrc = stream->exec_q->lrc[0]; 658 + u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32); 659 + u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) | 660 + (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0); 661 + 662 + struct flex regs_context[] = { 663 + { 664 + OACTXCONTROL(stream->hwe->mmio_base), 665 + stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1, 666 + enable ? OA_COUNTER_RESUME : 0, 667 + }, 668 + { 669 + RING_CONTEXT_CONTROL(stream->hwe->mmio_base), 670 + regs_offset + CTX_CONTEXT_CONTROL, 671 + _MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE, 672 + enable ? CTX_CTRL_OAC_CONTEXT_ENABLE : 0) 673 + }, 674 + }; 675 + struct xe_oa_reg reg_lri = { OAR_OACONTROL, oacontrol }; 676 + int err; 677 + 678 + /* Modify stream hwe context image with regs_context */ 679 + err = xe_oa_modify_ctx_image(stream, stream->exec_q->lrc[0], 680 + regs_context, ARRAY_SIZE(regs_context)); 681 + if (err) 682 + return err; 683 + 684 + /* Apply reg_lri using LRI */ 685 + return xe_oa_load_with_lri(stream, &reg_lri); 686 + } 687 + 610 688 #define HAS_OA_BPC_REPORTING(xe) (GRAPHICS_VERx100(xe) >= 1255) 611 689 612 690 static void xe_oa_disable_metric_set(struct xe_oa_stream *stream) ··· 710 614 xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN2, 711 615 _MASKED_BIT_DISABLE(DISABLE_DOP_GATING)); 712 616 } 617 + 618 + /* disable the context save/restore or OAR counters */ 619 + if (stream->exec_q) 620 + xe_oa_configure_oar_context(stream, false); 713 621 714 622 /* Make sure we disable noa to save power. 
*/ 715 623 xe_mmio_rmw32(stream->gt, RPM_CONFIG1, GT_NOA_ENABLE, 0); ··· 843 743 static int xe_oa_enable_metric_set(struct xe_oa_stream *stream) 844 744 { 845 745 u32 oa_debug, sqcnt1; 746 + int ret; 846 747 847 748 /* 848 749 * Wa_1508761755:xehpsdv, dg2 ··· 880 779 (HAS_OA_BPC_REPORTING(stream->oa->xe) ? SQCNT1_OABPC : 0); 881 780 882 781 xe_mmio_rmw32(stream->gt, XELPMP_SQCNT1, 0, sqcnt1); 782 + 783 + if (stream->exec_q) { 784 + ret = xe_oa_configure_oar_context(stream, true); 785 + if (ret) 786 + return ret; 787 + } 883 788 884 789 return xe_oa_emit_oa_config(stream); 885 790 } ··· 1056 949 .unlocked_ioctl = xe_oa_ioctl, 1057 950 }; 1058 951 952 + static bool engine_supports_mi_query(struct xe_hw_engine *hwe) 953 + { 954 + return hwe->class == XE_ENGINE_CLASS_RENDER || 955 + hwe->class == XE_ENGINE_CLASS_COMPUTE; 956 + } 957 + 958 + static bool xe_oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end) 959 + { 960 + u32 idx = *offset; 961 + u32 len = min(MI_LRI_LEN(state[idx]) + idx, end); 962 + bool found = false; 963 + 964 + idx++; 965 + for (; idx < len; idx += 2) { 966 + if (state[idx] == reg) { 967 + found = true; 968 + break; 969 + } 970 + } 971 + 972 + *offset = idx; 973 + return found; 974 + } 975 + 976 + #define IS_MI_LRI_CMD(x) (REG_FIELD_GET(MI_OPCODE, (x)) == \ 977 + REG_FIELD_GET(MI_OPCODE, MI_LOAD_REGISTER_IMM)) 978 + 979 + static u32 xe_oa_context_image_offset(struct xe_oa_stream *stream, u32 reg) 980 + { 981 + struct xe_lrc *lrc = stream->exec_q->lrc[0]; 982 + u32 len = (xe_gt_lrc_size(stream->gt, stream->hwe->class) + 983 + lrc->ring.size) / sizeof(u32); 984 + u32 offset = xe_lrc_regs_offset(lrc) / sizeof(u32); 985 + u32 *state = (u32 *)lrc->bo->vmap.vaddr; 986 + 987 + if (drm_WARN_ON(&stream->oa->xe->drm, !state)) 988 + return U32_MAX; 989 + 990 + for (; offset < len; ) { 991 + if (IS_MI_LRI_CMD(state[offset])) { 992 + /* 993 + * We expect reg-value pairs in MI_LRI command, so 994 + * MI_LRI_LEN() should be even 995 + */ 996 + 
drm_WARN_ON(&stream->oa->xe->drm, 997 + MI_LRI_LEN(state[offset]) & 0x1); 998 + 999 + if (xe_oa_find_reg_in_lri(state, reg, &offset, len)) 1000 + break; 1001 + } else { 1002 + offset++; 1003 + } 1004 + } 1005 + 1006 + return offset < len ? offset : U32_MAX; 1007 + } 1008 + 1009 + static int xe_oa_set_ctx_ctrl_offset(struct xe_oa_stream *stream) 1010 + { 1011 + struct xe_reg reg = OACTXCONTROL(stream->hwe->mmio_base); 1012 + u32 offset = stream->oa->ctx_oactxctrl_offset[stream->hwe->class]; 1013 + 1014 + /* Do this only once. Failure is stored as offset of U32_MAX */ 1015 + if (offset) 1016 + goto exit; 1017 + 1018 + offset = xe_oa_context_image_offset(stream, reg.addr); 1019 + stream->oa->ctx_oactxctrl_offset[stream->hwe->class] = offset; 1020 + 1021 + drm_dbg(&stream->oa->xe->drm, "%s oa ctx control at 0x%08x dword offset\n", 1022 + stream->hwe->name, offset); 1023 + exit: 1024 + return offset && offset != U32_MAX ? 0 : -ENODEV; 1025 + } 1026 + 1059 1027 static int xe_oa_stream_init(struct xe_oa_stream *stream, 1060 1028 struct xe_oa_open_param *param) 1061 1029 { ··· 1147 965 stream->sample = param->sample; 1148 966 stream->periodic = param->period_exponent > 0; 1149 967 stream->period_exponent = param->period_exponent; 968 + 969 + if (stream->exec_q && engine_supports_mi_query(stream->hwe)) { 970 + /* If we don't find the context offset, just return error */ 971 + ret = xe_oa_set_ctx_ctrl_offset(stream); 972 + if (ret) { 973 + drm_err(&stream->oa->xe->drm, 974 + "xe_oa_set_ctx_ctrl_offset failed for %s\n", 975 + stream->hwe->name); 976 + goto exit; 977 + } 978 + } 1150 979 1151 980 stream->oa_config = xe_oa_get_oa_config(stream->oa, param->metric_set); 1152 981 if (!stream->oa_config) {
+4
drivers/gpu/drm/xe/xe_oa_types.h
··· 13 13 14 14 #include <drm/xe_drm.h> 15 15 #include "regs/xe_reg_defs.h" 16 + #include "xe_hw_engine_types.h" 16 17 17 18 #define XE_OA_BUFFER_SIZE SZ_16M 18 19 ··· 137 136 138 137 /** @metrics_idr: List of dynamic configurations (struct xe_oa_config) */ 139 138 struct idr metrics_idr; 139 + 140 + /** @ctx_oactxctrl_offset: offset of OACTXCONTROL register in context image */ 141 + u32 ctx_oactxctrl_offset[XE_ENGINE_CLASS_MAX]; 140 142 141 143 /** @oa_formats: tracks all OA formats across platforms */ 142 144 const struct xe_oa_format *oa_formats;