drm/amdkfd: Update logic for CU occupancy calculations

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Currently, the code uses the IH_VMID_X_LUT register to map
a queue's VMID to the corresponding PASID. This logic is racy
because CP can update the VMID-PASID mapping at any time, especially
when there are more processes than VMIDs. Update the
logic to calculate CU occupancy by matching the doorbell offset of
each queue that has a non-zero wave count against the process's queues.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Mukul Joshi and committed by

Alex Deucher 2 years ago 6ae9e1ab e1d27f7a

+87 -63

6 changed files

expand all

drivers

gpu

drm

amd

amdgpu

amdgpu_amdkfd_gfx_v9.c

amdgpu_amdkfd_gfx_v9.h

amdkfd

kfd_device_queue_manager.c

kfd_device_queue_manager.h

kfd_process.c

include

kgd_kfd_interface.h

+40 -58

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c

··· 950 950 * @inst: xcc's instance number on a multi-XCC setup 951 951 */ 952 952 static void get_wave_count(struct amdgpu_device *adev, int queue_idx, 953 - int *wave_cnt, int *vmid, uint32_t inst) 953 + struct kfd_cu_occupancy *queue_cnt, uint32_t inst) 954 954 { 955 955 int pipe_idx; 956 956 int queue_slot; 957 957 unsigned int reg_val; 958 - 958 + unsigned int wave_cnt; 959 959 /* 960 960 * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID 961 961 * parameters to read out waves in flight. Get VMID if there are 962 962 * non-zero waves in flight. 963 963 */ 964 - *vmid = 0xFF; 965 - *wave_cnt = 0; 966 964 pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe; 967 965 queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe; 968 966 soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst); 969 - reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) + 970 - queue_slot); 971 - *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; 972 - if (*wave_cnt != 0) 973 - *vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) & 974 - CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT; 967 + reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, 968 + mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot); 969 + wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; 970 + if (wave_cnt != 0) { 971 + queue_cnt->wave_cnt += wave_cnt; 972 + queue_cnt->doorbell_off = 973 + (RREG32_SOC15(GC, inst, mmCP_HQD_PQ_DOORBELL_CONTROL) & 974 + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >> 975 + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; 976 + } 975 977 } 976 978 977 979 /** ··· 983 981 * or more queues running and submitting waves to compute units. 
984 982 * 985 983 * @adev: Handle of device from which to get number of waves in flight 986 - * @pasid: Identifies the process for which this query call is invoked 987 - * @pasid_wave_cnt: Output parameter updated with number of waves in flight that 988 - * belong to process with given pasid 984 + * @cu_occupancy: Array that gets filled with wave_cnt and doorbell offset 985 + * for comparison later. 989 986 * @max_waves_per_cu: Output parameter updated with maximum number of waves 990 987 * possible per Compute Unit 991 988 * @inst: xcc's instance number on a multi-XCC setup ··· 1012 1011 * number of waves that are in flight for the queue at specified index. The 1013 1012 * index ranges from 0 to 7. 1014 1013 * 1015 - * If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID 1016 - * of the wave(s). 1014 + * If non-zero waves are in flight, store the corresponding doorbell offset 1015 + * of the queue, along with the wave count. 1017 1016 * 1018 - * Determine if VMID from above step maps to pasid provided as parameter. If 1019 - * it matches agrregate the wave count. That the VMID will not match pasid is 1020 - * a normal condition i.e. a device is expected to support multiple queues 1021 - * from multiple proceses. 1017 + * Determine if the queue belongs to the process by comparing the doorbell 1018 + * offset against the process's queues. If it matches, aggregate the wave 1019 + * count for the process. 
1022 1020 * 1023 1021 * Reading registers referenced above involves programming GRBM appropriately 1024 1022 */ 1025 - void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid, 1026 - int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst) 1023 + void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, 1024 + struct kfd_cu_occupancy *cu_occupancy, 1025 + int *max_waves_per_cu, uint32_t inst) 1027 1026 { 1028 1027 int qidx; 1029 - int vmid; 1030 1028 int se_idx; 1031 - int sh_idx; 1032 1029 int se_cnt; 1033 - int sh_cnt; 1034 - int wave_cnt; 1035 1030 int queue_map; 1036 - int pasid_tmp; 1037 1031 int max_queue_cnt; 1038 - int vmid_wave_cnt = 0; 1039 1032 DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES); 1040 1033 1041 1034 lock_spi_csq_mutexes(adev); ··· 1043 1048 AMDGPU_MAX_QUEUES); 1044 1049 max_queue_cnt = adev->gfx.mec.num_pipe_per_mec * 1045 1050 adev->gfx.mec.num_queue_per_pipe; 1046 - sh_cnt = adev->gfx.config.max_sh_per_se; 1047 1051 se_cnt = adev->gfx.config.max_shader_engines; 1048 1052 for (se_idx = 0; se_idx < se_cnt; se_idx++) { 1049 - for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) { 1053 + amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst); 1054 + queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS); 1050 1055 1051 - amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst); 1052 - queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS); 1053 - 1054 - /* 1055 - * Assumption: queue map encodes following schema: four 1056 - * pipes per each micro-engine, with each pipe mapping 1057 - * eight queues. This schema is true for GFX9 devices 1058 - * and must be verified for newer device families 1056 + /* 1057 + * Assumption: queue map encodes following schema: four 1058 + * pipes per each micro-engine, with each pipe mapping 1059 + * eight queues. 
This schema is true for GFX9 devices 1060 + * and must be verified for newer device families 1061 + */ 1062 + for (qidx = 0; qidx < max_queue_cnt; qidx++) { 1063 + /* Skip qeueus that are not associated with 1064 + * compute functions 1059 1065 */ 1060 - for (qidx = 0; qidx < max_queue_cnt; qidx++) { 1066 + if (!test_bit(qidx, cp_queue_bitmap)) 1067 + continue; 1061 1068 1062 - /* Skip qeueus that are not associated with 1063 - * compute functions 1064 - */ 1065 - if (!test_bit(qidx, cp_queue_bitmap)) 1066 - continue; 1069 + if (!(queue_map & (1 << qidx))) 1070 + continue; 1067 1071 1068 - if (!(queue_map & (1 << qidx))) 1069 - continue; 1070 - 1071 - /* Get number of waves in flight and aggregate them */ 1072 - get_wave_count(adev, qidx, &wave_cnt, &vmid, 1073 - inst); 1074 - if (wave_cnt != 0) { 1075 - pasid_tmp = 1076 - RREG32(SOC15_REG_OFFSET(OSSSYS, inst, 1077 - mmIH_VMID_0_LUT) + vmid); 1078 - if (pasid_tmp == pasid) 1079 - vmid_wave_cnt += wave_cnt; 1080 - } 1081 - } 1072 + /* Get number of waves in flight and aggregate them */ 1073 + get_wave_count(adev, qidx, &cu_occupancy[qidx], 1074 + inst); 1082 1075 } 1083 1076 } 1084 1077 ··· 1075 1092 unlock_spi_csq_mutexes(adev); 1076 1093 1077 1094 /* Update the output parameters and return */ 1078 - *pasid_wave_cnt = vmid_wave_cnt; 1079 1095 *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu * 1080 1096 adev->gfx.cu_info.max_waves_per_simd; 1081 1097 }

+3 -2

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h

··· 52 52 uint8_t vmid, uint16_t *p_pasid); 53 53 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev, 54 54 uint32_t vmid, uint64_t page_table_base); 55 - void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid, 56 - int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst); 55 + void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, 56 + struct kfd_cu_occupancy *cu_occupancy, 57 + int *max_waves_per_cu, uint32_t inst); 57 58 void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev, 58 59 uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, 59 60 uint32_t inst);

+20

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

··· 3540 3540 return debug_map_and_unlock(dqm); 3541 3541 } 3542 3542 3543 + bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, 3544 + struct qcm_process_device *qpd, 3545 + int doorbell_off) 3546 + { 3547 + struct queue *q; 3548 + bool r = false; 3549 + 3550 + dqm_lock(dqm); 3551 + 3552 + list_for_each_entry(q, &qpd->queues_list, list) { 3553 + if (q->properties.doorbell_off == doorbell_off) { 3554 + r = true; 3555 + goto out; 3556 + } 3557 + } 3558 + 3559 + out: 3560 + dqm_unlock(dqm); 3561 + return r; 3562 + } 3543 3563 #if defined(CONFIG_DEBUG_FS) 3544 3564 3545 3565 static void seq_reg_dump(struct seq_file *m,

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h

··· 324 324 int debug_lock_and_unmap(struct device_queue_manager *dqm); 325 325 int debug_map_and_unlock(struct device_queue_manager *dqm); 326 326 int debug_refresh_runlist(struct device_queue_manager *dqm); 327 + bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, 328 + struct qcm_process_device *qpd, 329 + int doorbell_off); 327 330 328 331 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) 329 332 {

+13 -1

drivers/gpu/drm/amd/amdkfd/kfd_process.c

··· 270 270 struct kfd_node *dev = NULL; 271 271 struct kfd_process *proc = NULL; 272 272 struct kfd_process_device *pdd = NULL; 273 + int i; 274 + struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES]; 275 + 276 + memset(cu_occupancy, 0x0, sizeof(cu_occupancy)); 273 277 274 278 pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy); 275 279 dev = pdd->dev; ··· 291 287 /* Collect wave count from device if it supports */ 292 288 wave_cnt = 0; 293 289 max_waves_per_cu = 0; 294 - dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt, 290 + 291 + dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy, 295 292 &max_waves_per_cu, 0); 293 + 294 + for (i = 0; i < AMDGPU_MAX_QUEUES; i++) { 295 + if (cu_occupancy[i].wave_cnt != 0 && 296 + kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd, 297 + cu_occupancy[i].doorbell_off)) 298 + wave_cnt += cu_occupancy[i].wave_cnt; 299 + } 296 300 297 301 /* Translate wave count to number of compute units */ 298 302 cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;

+8 -2

drivers/gpu/drm/amd/include/kgd_kfd_interface.h

··· 71 71 KGD_POOL_FRAMEBUFFER = 3, 72 72 }; 73 73 74 + struct kfd_cu_occupancy { 75 + u32 wave_cnt; 76 + u32 doorbell_off; 77 + }; 78 + 74 79 /** 75 80 * enum kfd_sched_policy 76 81 * ··· 318 313 uint32_t grace_period, 319 314 uint32_t *reg_offset, 320 315 uint32_t *reg_data); 321 - void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid, 322 - int *wave_cnt, int *max_waves_per_cu, uint32_t inst); 316 + void (*get_cu_occupancy)(struct amdgpu_device *adev, 317 + struct kfd_cu_occupancy *cu_occupancy, 318 + int *max_waves_per_cu, uint32_t inst); 323 319 void (*program_trap_handler_settings)(struct amdgpu_device *adev, 324 320 uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, 325 321 uint32_t inst);