Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdkfd: add debug and runtime enable interface

Introduce the GPU debug operations interface.

For ROCm-GDB to extend the GNU Debugger's ability to inspect the AMD GPU
instruction set, provide the necessary interface to allow the debugger
to set HW debug modes and to query exceptions per HSA queue, process or
device.

The runtime_enable interface coordinates exception handling with the
HSA runtime.

Usage is available in the kern docs at uapi/linux/kfd_ioctl.h.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Jonathan Kim and committed by
Alex Deucher
4f98cf2b ba3c87ff

+715 -1
+48
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
··· 2729 2729 return ret; 2730 2730 } 2731 2731 2732 + static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data) 2733 + { 2734 + return 0; 2735 + } 2736 + 2737 + static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data) 2738 + { 2739 + struct kfd_ioctl_dbg_trap_args *args = data; 2740 + int r = 0; 2741 + 2742 + if (sched_policy == KFD_SCHED_POLICY_NO_HWS) { 2743 + pr_err("Debugging does not support sched_policy %i", sched_policy); 2744 + return -EINVAL; 2745 + } 2746 + 2747 + switch (args->op) { 2748 + case KFD_IOC_DBG_TRAP_ENABLE: 2749 + case KFD_IOC_DBG_TRAP_DISABLE: 2750 + case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT: 2751 + case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED: 2752 + case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE: 2753 + case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE: 2754 + case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES: 2755 + case KFD_IOC_DBG_TRAP_RESUME_QUEUES: 2756 + case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH: 2757 + case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH: 2758 + case KFD_IOC_DBG_TRAP_SET_FLAGS: 2759 + case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT: 2760 + case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO: 2761 + case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT: 2762 + case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT: 2763 + pr_warn("Debugging not supported yet\n"); 2764 + r = -EACCES; 2765 + break; 2766 + default: 2767 + pr_err("Invalid option: %i\n", args->op); 2768 + r = -EINVAL; 2769 + } 2770 + 2771 + return r; 2772 + } 2773 + 2732 2774 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ 2733 2775 [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ 2734 2776 .cmd_drv = 0, .name = #ioctl} ··· 2883 2841 2884 2842 AMDKFD_IOCTL_DEF(AMDKFD_IOC_EXPORT_DMABUF, 2885 2843 kfd_ioctl_export_dmabuf, 0), 2844 + 2845 + AMDKFD_IOCTL_DEF(AMDKFD_IOC_RUNTIME_ENABLE, 2846 + kfd_ioctl_runtime_enable, 0), 2847 + 2848 + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP, 2849 + kfd_ioctl_set_debug_trap, 0), 2886 2850 }; 2887 2851 2888 
2852 #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
+667 -1
include/uapi/linux/kfd_ioctl.h
··· 110 110 __u32 pad; 111 111 }; 112 112 113 + struct kfd_dbg_device_info_entry { 114 + __u64 exception_status; 115 + __u64 lds_base; 116 + __u64 lds_limit; 117 + __u64 scratch_base; 118 + __u64 scratch_limit; 119 + __u64 gpuvm_base; 120 + __u64 gpuvm_limit; 121 + __u32 gpu_id; 122 + __u32 location_id; 123 + __u32 vendor_id; 124 + __u32 device_id; 125 + __u32 revision_id; 126 + __u32 subsystem_vendor_id; 127 + __u32 subsystem_device_id; 128 + __u32 fw_version; 129 + __u32 gfx_target_version; 130 + __u32 simd_count; 131 + __u32 max_waves_per_simd; 132 + __u32 array_count; 133 + __u32 simd_arrays_per_engine; 134 + __u32 num_xcc; 135 + __u32 capability; 136 + __u32 debug_prop; 137 + }; 138 + 113 139 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ 114 140 #define KFD_IOC_CACHE_POLICY_COHERENT 0 115 141 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 ··· 799 773 __s32 xnack_enabled; 800 774 }; 801 775 776 + /* Wave launch override modes */ 777 + enum kfd_dbg_trap_override_mode { 778 + KFD_DBG_TRAP_OVERRIDE_OR = 0, 779 + KFD_DBG_TRAP_OVERRIDE_REPLACE = 1 780 + }; 781 + 782 + /* Wave launch overrides */ 783 + enum kfd_dbg_trap_mask { 784 + KFD_DBG_TRAP_MASK_FP_INVALID = 1, 785 + KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL = 2, 786 + KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO = 4, 787 + KFD_DBG_TRAP_MASK_FP_OVERFLOW = 8, 788 + KFD_DBG_TRAP_MASK_FP_UNDERFLOW = 16, 789 + KFD_DBG_TRAP_MASK_FP_INEXACT = 32, 790 + KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO = 64, 791 + KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH = 128, 792 + KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION = 256, 793 + KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START = (1 << 30), 794 + KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END = (1 << 31) 795 + }; 796 + 797 + /* Wave launch modes */ 798 + enum kfd_dbg_trap_wave_launch_mode { 799 + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL = 0, 800 + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT = 1, 801 + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG = 3 802 + }; 803 + 804 + /* Address watch modes */ 805 + enum 
kfd_dbg_trap_address_watch_mode { 806 + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_READ = 0, 807 + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_NONREAD = 1, 808 + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ATOMIC = 2, 809 + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ALL = 3 810 + }; 811 + 812 + /* Additional wave settings */ 813 + enum kfd_dbg_trap_flags { 814 + KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP = 1, 815 + }; 816 + 817 + /* Trap exceptions */ 818 + enum kfd_dbg_trap_exception_code { 819 + EC_NONE = 0, 820 + /* per queue */ 821 + EC_QUEUE_WAVE_ABORT = 1, 822 + EC_QUEUE_WAVE_TRAP = 2, 823 + EC_QUEUE_WAVE_MATH_ERROR = 3, 824 + EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION = 4, 825 + EC_QUEUE_WAVE_MEMORY_VIOLATION = 5, 826 + EC_QUEUE_WAVE_APERTURE_VIOLATION = 6, 827 + EC_QUEUE_PACKET_DISPATCH_DIM_INVALID = 16, 828 + EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID = 17, 829 + EC_QUEUE_PACKET_DISPATCH_CODE_INVALID = 18, 830 + EC_QUEUE_PACKET_RESERVED = 19, 831 + EC_QUEUE_PACKET_UNSUPPORTED = 20, 832 + EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21, 833 + EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22, 834 + EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23, 835 + EC_QUEUE_PREEMPTION_ERROR = 30, 836 + EC_QUEUE_NEW = 31, 837 + /* per device */ 838 + EC_DEVICE_QUEUE_DELETE = 32, 839 + EC_DEVICE_MEMORY_VIOLATION = 33, 840 + EC_DEVICE_RAS_ERROR = 34, 841 + EC_DEVICE_FATAL_HALT = 35, 842 + EC_DEVICE_NEW = 36, 843 + /* per process */ 844 + EC_PROCESS_RUNTIME = 48, 845 + EC_PROCESS_DEVICE_REMOVE = 49, 846 + EC_MAX 847 + }; 848 + 849 + /* Mask generated by ecode in kfd_dbg_trap_exception_code */ 850 + #define KFD_EC_MASK(ecode) (1ULL << (ecode - 1)) 851 + 852 + /* Masks for exception code type checks below */ 853 + #define KFD_EC_MASK_QUEUE (KFD_EC_MASK(EC_QUEUE_WAVE_ABORT) | \ 854 + KFD_EC_MASK(EC_QUEUE_WAVE_TRAP) | \ 855 + KFD_EC_MASK(EC_QUEUE_WAVE_MATH_ERROR) | \ 856 + KFD_EC_MASK(EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION) | \ 857 + KFD_EC_MASK(EC_QUEUE_WAVE_MEMORY_VIOLATION) | \ 858 + KFD_EC_MASK(EC_QUEUE_WAVE_APERTURE_VIOLATION) | \ 859 + 
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) | \ 860 + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) | \ 861 + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) | \ 862 + KFD_EC_MASK(EC_QUEUE_PACKET_RESERVED) | \ 863 + KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) | \ 864 + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) | \ 865 + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) | \ 866 + KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) | \ 867 + KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) | \ 868 + KFD_EC_MASK(EC_QUEUE_NEW)) 869 + #define KFD_EC_MASK_DEVICE (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) | \ 870 + KFD_EC_MASK(EC_DEVICE_RAS_ERROR) | \ 871 + KFD_EC_MASK(EC_DEVICE_FATAL_HALT) | \ 872 + KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION) | \ 873 + KFD_EC_MASK(EC_DEVICE_NEW)) 874 + #define KFD_EC_MASK_PROCESS (KFD_EC_MASK(EC_PROCESS_RUNTIME) | \ 875 + KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE)) 876 + 877 + /* Checks for exception code types for KFD search */ 878 + #define KFD_DBG_EC_TYPE_IS_QUEUE(ecode) \ 879 + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_QUEUE)) 880 + #define KFD_DBG_EC_TYPE_IS_DEVICE(ecode) \ 881 + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_DEVICE)) 882 + #define KFD_DBG_EC_TYPE_IS_PROCESS(ecode) \ 883 + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_PROCESS)) 884 + 885 + 886 + /* Runtime enable states */ 887 + enum kfd_dbg_runtime_state { 888 + DEBUG_RUNTIME_STATE_DISABLED = 0, 889 + DEBUG_RUNTIME_STATE_ENABLED = 1, 890 + DEBUG_RUNTIME_STATE_ENABLED_BUSY = 2, 891 + DEBUG_RUNTIME_STATE_ENABLED_ERROR = 3 892 + }; 893 + 894 + /* Runtime enable status */ 895 + struct kfd_runtime_info { 896 + __u64 r_debug; 897 + __u32 runtime_state; 898 + __u32 ttmp_setup; 899 + }; 900 + 901 + /* Enable modes for runtime enable */ 902 + #define KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK 1 903 + #define KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK 2 904 + 905 + /** 906 + * kfd_ioctl_runtime_enable_args - Arguments for runtime enable 907 + * 908 + * Coordinates debug exception 
signalling and debug device enablement with runtime. 909 + * 910 + * @r_debug - pointer to user struct for sharing information between ROCr and the debuggger 911 + * @mode_mask - mask to set mode 912 + * KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK - enable runtime for debugging, otherwise disable 913 + * KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK - enable trap temporary setup (ignore on disable) 914 + * @capabilities_mask - mask to notify runtime on what KFD supports 915 + * 916 + * Return - 0 on SUCCESS. 917 + * - EBUSY if runtime enable call already pending. 918 + * - EEXIST if user queues already active prior to call. 919 + * If process is debug enabled, runtime enable will enable debug devices and 920 + * wait for debugger process to send runtime exception EC_PROCESS_RUNTIME 921 + * to unblock - see kfd_ioctl_dbg_trap_args. 922 + * 923 + */ 924 + struct kfd_ioctl_runtime_enable_args { 925 + __u64 r_debug; 926 + __u32 mode_mask; 927 + __u32 capabilities_mask; 928 + }; 929 + 930 + /* Queue information */ 931 + struct kfd_queue_snapshot_entry { 932 + __u64 exception_status; 933 + __u64 ring_base_address; 934 + __u64 write_pointer_address; 935 + __u64 read_pointer_address; 936 + __u64 ctx_save_restore_address; 937 + __u32 queue_id; 938 + __u32 gpu_id; 939 + __u32 ring_size; 940 + __u32 queue_type; 941 + __u32 ctx_save_restore_area_size; 942 + __u32 reserved; 943 + }; 944 + 945 + /* Queue status return for suspend/resume */ 946 + #define KFD_DBG_QUEUE_ERROR_BIT 30 947 + #define KFD_DBG_QUEUE_INVALID_BIT 31 948 + #define KFD_DBG_QUEUE_ERROR_MASK (1 << KFD_DBG_QUEUE_ERROR_BIT) 949 + #define KFD_DBG_QUEUE_INVALID_MASK (1 << KFD_DBG_QUEUE_INVALID_BIT) 950 + 951 + /* Context save area header information */ 952 + struct kfd_context_save_area_header { 953 + struct { 954 + __u32 control_stack_offset; 955 + __u32 control_stack_size; 956 + __u32 wave_state_offset; 957 + __u32 wave_state_size; 958 + } wave_state; 959 + __u32 debug_offset; 960 + __u32 debug_size; 961 + __u64 
err_payload_addr; 962 + __u32 err_event_id; 963 + __u32 reserved1; 964 + }; 965 + 966 + /* 967 + * Debug operations 968 + * 969 + * For specifics on usage and return values, see documentation per operation 970 + * below. Otherwise, generic error returns apply: 971 + * - ESRCH if the process to debug does not exist. 972 + * 973 + * - EINVAL (with KFD_IOC_DBG_TRAP_ENABLE exempt) if operation 974 + * KFD_IOC_DBG_TRAP_ENABLE has not succeeded prior. 975 + * Also returns this error if GPU hardware scheduling is not supported. 976 + * 977 + * - EPERM (with KFD_IOC_DBG_TRAP_DISABLE exempt) if target process is not 978 + * PTRACE_ATTACHED. KFD_IOC_DBG_TRAP_DISABLE is exempt to allow 979 + * clean up of debug mode as long as process is debug enabled. 980 + * 981 + * - EACCES if any DBG_HW_OP (debug hardware operation) is requested when 982 + * AMDKFD_IOC_RUNTIME_ENABLE has not succeeded prior. 983 + * 984 + * - ENODEV if any GPU does not support debugging on a DBG_HW_OP call. 985 + * 986 + * - Other errors may be returned when a DBG_HW_OP occurs while the GPU 987 + * is in a fatal state. 
988 + * 989 + */ 990 + enum kfd_dbg_trap_operations { 991 + KFD_IOC_DBG_TRAP_ENABLE = 0, 992 + KFD_IOC_DBG_TRAP_DISABLE = 1, 993 + KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT = 2, 994 + KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED = 3, 995 + KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE = 4, /* DBG_HW_OP */ 996 + KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE = 5, /* DBG_HW_OP */ 997 + KFD_IOC_DBG_TRAP_SUSPEND_QUEUES = 6, /* DBG_HW_OP */ 998 + KFD_IOC_DBG_TRAP_RESUME_QUEUES = 7, /* DBG_HW_OP */ 999 + KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH = 8, /* DBG_HW_OP */ 1000 + KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH = 9, /* DBG_HW_OP */ 1001 + KFD_IOC_DBG_TRAP_SET_FLAGS = 10, 1002 + KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT = 11, 1003 + KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO = 12, 1004 + KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT = 13, 1005 + KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT = 14 1006 + }; 1007 + 1008 + /** 1009 + * kfd_ioctl_dbg_trap_enable_args 1010 + * 1011 + * Arguments for KFD_IOC_DBG_TRAP_ENABLE. 1012 + * 1013 + * Enables debug session for target process. Call @op KFD_IOC_DBG_TRAP_DISABLE in 1014 + * kfd_ioctl_dbg_trap_args to disable debug session. 1015 + * 1016 + * @exception_mask (IN) - exceptions to raise to the debugger 1017 + * @rinfo_ptr (IN) - pointer to runtime info buffer (see kfd_runtime_info) 1018 + * @rinfo_size (IN/OUT) - size of runtime info buffer in bytes 1019 + * @dbg_fd (IN) - fd the KFD will nofify the debugger with of raised 1020 + * exceptions set in exception_mask. 1021 + * 1022 + * Generic errors apply (see kfd_dbg_trap_operations). 1023 + * Return - 0 on SUCCESS. 1024 + * Copies KFD saved kfd_runtime_info to @rinfo_ptr on enable. 1025 + * Size of kfd_runtime saved by the KFD returned to @rinfo_size. 1026 + * - EBADF if KFD cannot get a reference to dbg_fd. 1027 + * - EFAULT if KFD cannot copy runtime info to rinfo_ptr. 1028 + * - EINVAL if target process is already debug enabled. 
1029 + * 1030 + */ 1031 + struct kfd_ioctl_dbg_trap_enable_args { 1032 + __u64 exception_mask; 1033 + __u64 rinfo_ptr; 1034 + __u32 rinfo_size; 1035 + __u32 dbg_fd; 1036 + }; 1037 + 1038 + /** 1039 + * kfd_ioctl_dbg_trap_send_runtime_event_args 1040 + * 1041 + * 1042 + * Arguments for KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT. 1043 + * Raises exceptions to runtime. 1044 + * 1045 + * @exception_mask (IN) - exceptions to raise to runtime 1046 + * @gpu_id (IN) - target device id 1047 + * @queue_id (IN) - target queue id 1048 + * 1049 + * Generic errors apply (see kfd_dbg_trap_operations). 1050 + * Return - 0 on SUCCESS. 1051 + * - ENODEV if gpu_id not found. 1052 + * If exception_mask contains EC_PROCESS_RUNTIME, unblocks pending 1053 + * AMDKFD_IOC_RUNTIME_ENABLE call - see kfd_ioctl_runtime_enable_args. 1054 + * All other exceptions are raised to runtime through err_payload_addr. 1055 + * See kfd_context_save_area_header. 1056 + */ 1057 + struct kfd_ioctl_dbg_trap_send_runtime_event_args { 1058 + __u64 exception_mask; 1059 + __u32 gpu_id; 1060 + __u32 queue_id; 1061 + }; 1062 + 1063 + /** 1064 + * kfd_ioctl_dbg_trap_set_exceptions_enabled_args 1065 + * 1066 + * Arguments for KFD_IOC_SET_EXCEPTIONS_ENABLED 1067 + * Set new exceptions to be raised to the debugger. 1068 + * 1069 + * @exception_mask (IN) - new exceptions to raise the debugger 1070 + * 1071 + * Generic errors apply (see kfd_dbg_trap_operations). 1072 + * Return - 0 on SUCCESS. 1073 + */ 1074 + struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args { 1075 + __u64 exception_mask; 1076 + }; 1077 + 1078 + /** 1079 + * kfd_ioctl_dbg_trap_set_wave_launch_override_args 1080 + * 1081 + * Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE 1082 + * Enable HW exceptions to raise trap. 1083 + * 1084 + * @override_mode (IN) - see kfd_dbg_trap_override_mode 1085 + * @enable_mask (IN/OUT) - reference kfd_dbg_trap_mask. 1086 + * IN is the override modes requested to be enabled. 1087 + * OUT is referenced in Return below. 
1088 + * @support_request_mask (IN/OUT) - reference kfd_dbg_trap_mask. 1089 + * IN is the override modes requested for support check. 1090 + * OUT is referenced in Return below. 1091 + * 1092 + * Generic errors apply (see kfd_dbg_trap_operations). 1093 + * Return - 0 on SUCCESS. 1094 + * Previous enablement is returned in @enable_mask. 1095 + * Actual override support is returned in @support_request_mask. 1096 + * - EINVAL if override mode is not supported. 1097 + * - EACCES if trap support requested is not actually supported. 1098 + * i.e. enable_mask (IN) is not a subset of support_request_mask (OUT). 1099 + * Otherwise it is considered a generic error (see kfd_dbg_trap_operations). 1100 + */ 1101 + struct kfd_ioctl_dbg_trap_set_wave_launch_override_args { 1102 + __u32 override_mode; 1103 + __u32 enable_mask; 1104 + __u32 support_request_mask; 1105 + __u32 pad; 1106 + }; 1107 + 1108 + /** 1109 + * kfd_ioctl_dbg_trap_set_wave_launch_mode_args 1110 + * 1111 + * Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE 1112 + * Set wave launch mode. 1113 + * 1114 + * @mode (IN) - see kfd_dbg_trap_wave_launch_mode 1115 + * 1116 + * Generic errors apply (see kfd_dbg_trap_operations). 1117 + * Return - 0 on SUCCESS. 1118 + */ 1119 + struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args { 1120 + __u32 launch_mode; 1121 + __u32 pad; 1122 + }; 1123 + 1124 + /** 1125 + * kfd_ioctl_dbg_trap_suspend_queues_ags 1126 + * 1127 + * Arguments for KFD_IOC_DBG_TRAP_SUSPEND_QUEUES 1128 + * Suspend queues. 1129 + * 1130 + * @exception_mask (IN) - raised exceptions to clear 1131 + * @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id) 1132 + * to suspend 1133 + * @num_queues (IN) - number of queues to suspend in @queue_array_ptr 1134 + * @grace_period (IN) - wave time allowance before preemption 1135 + * per 1K GPU clock cycle unit 1136 + * 1137 + * Generic errors apply (see kfd_dbg_trap_operations). 
1138 + * Destruction of a suspended queue is blocked until the queue is 1139 + * resumed. This allows the debugger to access queue information and 1140 + * the its context save area without running into a race condition on 1141 + * queue destruction. 1142 + * Automatically copies per queue context save area header information 1143 + * into the save area base 1144 + * (see kfd_queue_snapshot_entry and kfd_context_save_area_header). 1145 + * 1146 + * Return - Number of queues suspended on SUCCESS. 1147 + * . KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK masked 1148 + * for each queue id in @queue_array_ptr array reports unsuccessful 1149 + * suspend reason. 1150 + * KFD_DBG_QUEUE_ERROR_MASK = HW failure. 1151 + * KFD_DBG_QUEUE_INVALID_MASK = queue does not exist, is new or 1152 + * is being destroyed. 1153 + */ 1154 + struct kfd_ioctl_dbg_trap_suspend_queues_args { 1155 + __u64 exception_mask; 1156 + __u64 queue_array_ptr; 1157 + __u32 num_queues; 1158 + __u32 grace_period; 1159 + }; 1160 + 1161 + /** 1162 + * kfd_ioctl_dbg_trap_resume_queues_args 1163 + * 1164 + * Arguments for KFD_IOC_DBG_TRAP_RESUME_QUEUES 1165 + * Resume queues. 1166 + * 1167 + * @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id) 1168 + * to resume 1169 + * @num_queues (IN) - number of queues to resume in @queue_array_ptr 1170 + * 1171 + * Generic errors apply (see kfd_dbg_trap_operations). 1172 + * Return - Number of queues resumed on SUCCESS. 1173 + * KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK mask 1174 + * for each queue id in @queue_array_ptr array reports unsuccessful 1175 + * resume reason. 1176 + * KFD_DBG_QUEUE_ERROR_MASK = HW failure. 1177 + * KFD_DBG_QUEUE_INVALID_MASK = queue does not exist. 
1178 + */ 1179 + struct kfd_ioctl_dbg_trap_resume_queues_args { 1180 + __u64 queue_array_ptr; 1181 + __u32 num_queues; 1182 + __u32 pad; 1183 + }; 1184 + 1185 + /** 1186 + * kfd_ioctl_dbg_trap_set_node_address_watch_args 1187 + * 1188 + * Arguments for KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH 1189 + * Sets address watch for device. 1190 + * 1191 + * @address (IN) - watch address to set 1192 + * @mode (IN) - see kfd_dbg_trap_address_watch_mode 1193 + * @mask (IN) - watch address mask 1194 + * @gpu_id (IN) - target gpu to set watch point 1195 + * @id (OUT) - watch id allocated 1196 + * 1197 + * Generic errors apply (see kfd_dbg_trap_operations). 1198 + * Return - 0 on SUCCESS. 1199 + * Allocated watch ID returned to @id. 1200 + * - ENODEV if gpu_id not found. 1201 + * - ENOMEM if watch IDs can be allocated 1202 + */ 1203 + struct kfd_ioctl_dbg_trap_set_node_address_watch_args { 1204 + __u64 address; 1205 + __u32 mode; 1206 + __u32 mask; 1207 + __u32 gpu_id; 1208 + __u32 id; 1209 + }; 1210 + 1211 + /** 1212 + * kfd_ioctl_dbg_trap_clear_node_address_watch_args 1213 + * 1214 + * Arguments for KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH 1215 + * Clear address watch for device. 1216 + * 1217 + * @gpu_id (IN) - target device to clear watch point 1218 + * @id (IN) - allocated watch id to clear 1219 + * 1220 + * Generic errors apply (see kfd_dbg_trap_operations). 1221 + * Return - 0 on SUCCESS. 1222 + * - ENODEV if gpu_id not found. 1223 + * - EINVAL if watch ID has not been allocated. 1224 + */ 1225 + struct kfd_ioctl_dbg_trap_clear_node_address_watch_args { 1226 + __u32 gpu_id; 1227 + __u32 id; 1228 + }; 1229 + 1230 + /** 1231 + * kfd_ioctl_dbg_trap_set_flags_args 1232 + * 1233 + * Arguments for KFD_IOC_DBG_TRAP_SET_FLAGS 1234 + * Sets flags for wave behaviour. 1235 + * 1236 + * @flags (IN/OUT) - IN = flags to enable, OUT = flags previously enabled 1237 + * 1238 + * Generic errors apply (see kfd_dbg_trap_operations). 1239 + * Return - 0 on SUCCESS. 
1240 + * - EACCESS if any debug device does not allow flag options. 1241 + */ 1242 + struct kfd_ioctl_dbg_trap_set_flags_args { 1243 + __u32 flags; 1244 + __u32 pad; 1245 + }; 1246 + 1247 + /** 1248 + * kfd_ioctl_dbg_trap_query_debug_event_args 1249 + * 1250 + * Arguments for KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT 1251 + * 1252 + * Find one or more raised exceptions. This function can return multiple 1253 + * exceptions from a single queue or a single device with one call. To find 1254 + * all raised exceptions, this function must be called repeatedly until it 1255 + * returns -EAGAIN. Returned exceptions can optionally be cleared by 1256 + * setting the corresponding bit in the @exception_mask input parameter. 1257 + * However, clearing an exception prevents retrieving further information 1258 + * about it with KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO. 1259 + * 1260 + * @exception_mask (IN/OUT) - exception to clear (IN) and raised (OUT) 1261 + * @gpu_id (OUT) - gpu id of exceptions raised 1262 + * @queue_id (OUT) - queue id of exceptions raised 1263 + * 1264 + * Generic errors apply (see kfd_dbg_trap_operations). 1265 + * Return - 0 on raised exception found 1266 + * Raised exceptions found are returned in @exception mask 1267 + * with reported source id returned in @gpu_id or @queue_id. 1268 + * - EAGAIN if no raised exception has been found 1269 + */ 1270 + struct kfd_ioctl_dbg_trap_query_debug_event_args { 1271 + __u64 exception_mask; 1272 + __u32 gpu_id; 1273 + __u32 queue_id; 1274 + }; 1275 + 1276 + /** 1277 + * kfd_ioctl_dbg_trap_query_exception_info_args 1278 + * 1279 + * Arguments KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO 1280 + * Get additional info on raised exception. 
1281 + * 1282 + * @info_ptr (IN) - pointer to exception info buffer to copy to 1283 + * @info_size (IN/OUT) - exception info buffer size (bytes) 1284 + * @source_id (IN) - target gpu or queue id 1285 + * @exception_code (IN) - target exception 1286 + * @clear_exception (IN) - clear raised @exception_code exception 1287 + * (0 = false, 1 = true) 1288 + * 1289 + * Generic errors apply (see kfd_dbg_trap_operations). 1290 + * Return - 0 on SUCCESS. 1291 + * If @exception_code is EC_DEVICE_MEMORY_VIOLATION, copy @info_size(OUT) 1292 + * bytes of memory exception data to @info_ptr. 1293 + * If @exception_code is EC_PROCESS_RUNTIME, copy saved 1294 + * kfd_runtime_info to @info_ptr. 1295 + * Actual required @info_ptr size (bytes) is returned in @info_size. 1296 + */ 1297 + struct kfd_ioctl_dbg_trap_query_exception_info_args { 1298 + __u64 info_ptr; 1299 + __u32 info_size; 1300 + __u32 source_id; 1301 + __u32 exception_code; 1302 + __u32 clear_exception; 1303 + }; 1304 + 1305 + /** 1306 + * kfd_ioctl_dbg_trap_get_queue_snapshot_args 1307 + * 1308 + * Arguments KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT 1309 + * Get queue information. 1310 + * 1311 + * @exception_mask (IN) - exceptions raised to clear 1312 + * @snapshot_buf_ptr (IN) - queue snapshot entry buffer (see kfd_queue_snapshot_entry) 1313 + * @num_queues (IN/OUT) - number of queue snapshot entries 1314 + * The debugger specifies the size of the array allocated in @num_queues. 1315 + * KFD returns the number of queues that actually existed. If this is 1316 + * larger than the size specified by the debugger, KFD will not overflow 1317 + * the array allocated by the debugger. 1318 + * 1319 + * @entry_size (IN/OUT) - size per entry in bytes 1320 + * The debugger specifies sizeof(struct kfd_queue_snapshot_entry) in 1321 + * @entry_size. KFD returns the number of bytes actually populated per 1322 + * entry. 
The debugger should use the KFD_IOCTL_MINOR_VERSION to determine, 1323 + * which fields in struct kfd_queue_snapshot_entry are valid. This allows 1324 + * growing the ABI in a backwards compatible manner. 1325 + * Note that entry_size(IN) should still be used to stride the snapshot buffer in the 1326 + * event that it's larger than actual kfd_queue_snapshot_entry. 1327 + * 1328 + * Generic errors apply (see kfd_dbg_trap_operations). 1329 + * Return - 0 on SUCCESS. 1330 + * Copies @num_queues(IN) queue snapshot entries of size @entry_size(IN) 1331 + * into @snapshot_buf_ptr if @num_queues(IN) > 0. 1332 + * Otherwise return @num_queues(OUT) queue snapshot entries that exist. 1333 + */ 1334 + struct kfd_ioctl_dbg_trap_queue_snapshot_args { 1335 + __u64 exception_mask; 1336 + __u64 snapshot_buf_ptr; 1337 + __u32 num_queues; 1338 + __u32 entry_size; 1339 + }; 1340 + 1341 + /** 1342 + * kfd_ioctl_dbg_trap_get_device_snapshot_args 1343 + * 1344 + * Arguments for KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT 1345 + * Get device information. 1346 + * 1347 + * @exception_mask (IN) - exceptions raised to clear 1348 + * @snapshot_buf_ptr (IN) - pointer to snapshot buffer (see kfd_dbg_device_info_entry) 1349 + * @num_devices (IN/OUT) - number of debug devices to snapshot 1350 + * The debugger specifies the size of the array allocated in @num_devices. 1351 + * KFD returns the number of devices that actually existed. If this is 1352 + * larger than the size specified by the debugger, KFD will not overflow 1353 + * the array allocated by the debugger. 1354 + * 1355 + * @entry_size (IN/OUT) - size per entry in bytes 1356 + * The debugger specifies sizeof(struct kfd_dbg_device_info_entry) in 1357 + * @entry_size. KFD returns the number of bytes actually populated. The 1358 + * debugger should use KFD_IOCTL_MINOR_VERSION to determine, which fields 1359 + * in struct kfd_dbg_device_info_entry are valid. This allows growing the 1360 + * ABI in a backwards compatible manner. 
1361 + * Note that entry_size(IN) should still be used to stride the snapshot buffer in the 1362 + * event that it's larger than actual kfd_dbg_device_info_entry. 1363 + * 1364 + * Generic errors apply (see kfd_dbg_trap_operations). 1365 + * Return - 0 on SUCCESS. 1366 + * Copies @num_devices(IN) device snapshot entries of size @entry_size(IN) 1367 + * into @snapshot_buf_ptr if @num_devices(IN) > 0. 1368 + * Otherwise return @num_devices(OUT) queue snapshot entries that exist. 1369 + */ 1370 + struct kfd_ioctl_dbg_trap_device_snapshot_args { 1371 + __u64 exception_mask; 1372 + __u64 snapshot_buf_ptr; 1373 + __u32 num_devices; 1374 + __u32 entry_size; 1375 + }; 1376 + 1377 + /** 1378 + * kfd_ioctl_dbg_trap_args 1379 + * 1380 + * Arguments to debug target process. 1381 + * 1382 + * @pid - target process to debug 1383 + * @op - debug operation (see kfd_dbg_trap_operations) 1384 + * 1385 + * @op determines which union struct args to use. 1386 + * Refer to kern docs for each kfd_ioctl_dbg_trap_*_args struct. 
1387 + */ 1388 + struct kfd_ioctl_dbg_trap_args { 1389 + __u32 pid; 1390 + __u32 op; 1391 + 1392 + union { 1393 + struct kfd_ioctl_dbg_trap_enable_args enable; 1394 + struct kfd_ioctl_dbg_trap_send_runtime_event_args send_runtime_event; 1395 + struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args set_exceptions_enabled; 1396 + struct kfd_ioctl_dbg_trap_set_wave_launch_override_args launch_override; 1397 + struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args launch_mode; 1398 + struct kfd_ioctl_dbg_trap_suspend_queues_args suspend_queues; 1399 + struct kfd_ioctl_dbg_trap_resume_queues_args resume_queues; 1400 + struct kfd_ioctl_dbg_trap_set_node_address_watch_args set_node_address_watch; 1401 + struct kfd_ioctl_dbg_trap_clear_node_address_watch_args clear_node_address_watch; 1402 + struct kfd_ioctl_dbg_trap_set_flags_args set_flags; 1403 + struct kfd_ioctl_dbg_trap_query_debug_event_args query_debug_event; 1404 + struct kfd_ioctl_dbg_trap_query_exception_info_args query_exception_info; 1405 + struct kfd_ioctl_dbg_trap_queue_snapshot_args queue_snapshot; 1406 + struct kfd_ioctl_dbg_trap_device_snapshot_args device_snapshot; 1407 + }; 1408 + }; 1409 + 802 1410 #define AMDKFD_IOCTL_BASE 'K' 803 1411 #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) 804 1412 #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) ··· 1547 887 #define AMDKFD_IOC_EXPORT_DMABUF \ 1548 888 AMDKFD_IOWR(0x24, struct kfd_ioctl_export_dmabuf_args) 1549 889 890 + #define AMDKFD_IOC_RUNTIME_ENABLE \ 891 + AMDKFD_IOWR(0x25, struct kfd_ioctl_runtime_enable_args) 892 + 893 + #define AMDKFD_IOC_DBG_TRAP \ 894 + AMDKFD_IOWR(0x26, struct kfd_ioctl_dbg_trap_args) 895 + 1550 896 #define AMDKFD_COMMAND_START 0x01 1551 - #define AMDKFD_COMMAND_END 0x25 897 + #define AMDKFD_COMMAND_END 0x27 1552 898 1553 899 #endif