Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Adjust removal control flow for smu v13_0_2

Adjust removal control flow for smu v13_0_2:
During amdgpu uninstallation, when removing the first
device, the kernel needs to first send a mode1reset message
to all gpu devices. Otherwise, smu initialization will fail
the next time amdgpu is installed.

V2:
1. Update commit comments.
2. Remove the global variable amdgpu_device_remove_cnt
and add a variable to the structure amdgpu_hive_info.
3. Use hive to detect the first removed device instead of
a global variable.

V3:
1. Update commit comments.
2. Split a patch into multiple patches.
3. The current patch does:
a. Add a work mode, AMDGPU_RESET_FOR_DEVICE_REMOVE, to
the existing gpu recover path, which makes all devices
in the hive list undergo only a HW reset with no resume
(except for the base IP blocks).
b. Call amdgpu_device_gpu_recover with the
AMDGPU_RESET_FOR_DEVICE_REMOVE and AMDGPU_NEED_FULL_RESET
flags set in amdgpu_pci_remove when removing the first
device in the hive list.
c. When removing the first device, the key function call
sequence of the IP blocks is as follows:
.suspend->mode1reset->.resume(basic ip)->.hw_fini->.early_fini->.sw_fini.
   ^                          |
   |-<----------<---------<---|
The first three steps result from the call to
amdgpu_device_gpu_recover. These three steps are
executed in a loop until every device in the hive list
has been iterated.
The steps from .hw_fini onward apply only to the
first device. Since .suspend has already been called,
the .hw_fini of every ip block of the current device does
nothing, except for the resumed phase1 basic ip blocks.
d. When removing the other devices, the calling sequence is the
same as the legacy one:
.hw_fini -> .early_fini -> .sw_fini.
Since .suspend was already called when the first device was removed,
the .hw_fini of every ip block of the current device does nothing,
except for the resumed phase1 basic ip blocks.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

YiPeng Chai and committed by
Alex Deucher
f5c7e779 e4cf73fd

+62
+30
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 4749 4749 struct amdgpu_device *tmp_adev = NULL; 4750 4750 bool need_full_reset, skip_hw_reset, vram_lost = false; 4751 4751 int r = 0; 4752 + bool gpu_reset_for_dev_remove = 0; 4752 4753 4753 4754 /* Try reset handler method first */ 4754 4755 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, ··· 4768 4767 need_full_reset = 4769 4768 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4770 4769 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4770 + 4771 + gpu_reset_for_dev_remove = 4772 + test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4773 + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4771 4774 4772 4775 /* 4773 4776 * ASIC reset has to be done on all XGMI hive nodes ASAP ··· 4815 4810 } 4816 4811 4817 4812 amdgpu_ras_intr_cleared(); 4813 + } 4814 + 4815 + /* Since the mode1 reset affects base ip blocks, the 4816 + * phase1 ip blocks need to be resumed. Otherwise there 4817 + * will be a BIOS signature error and the psp bootloader 4818 + * can't load kdb on the next amdgpu install. 4819 + */ 4820 + if (gpu_reset_for_dev_remove) { 4821 + list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4822 + amdgpu_device_ip_resume_phase1(tmp_adev); 4823 + 4824 + goto end; 4818 4825 } 4819 4826 4820 4827 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { ··· 5151 5134 bool need_emergency_restart = false; 5152 5135 bool audio_suspended = false; 5153 5136 int tmp_vram_lost_counter; 5137 + bool gpu_reset_for_dev_remove = false; 5138 + 5139 + gpu_reset_for_dev_remove = 5140 + test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5141 + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5154 5142 5155 5143 /* 5156 5144 * Special case: RAS triggered and full reset isn't supported ··· 5275 5253 5276 5254 retry: /* Rest of adevs pre asic reset from XGMI hive. 
*/ 5277 5255 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5256 + if (gpu_reset_for_dev_remove) { 5257 + /* Workaroud for ASICs need to disable SMC first */ 5258 + amdgpu_device_smu_fini_early(tmp_adev); 5259 + } 5278 5260 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5279 5261 /*TODO Should we stop ?*/ 5280 5262 if (r) { ··· 5312 5286 adev->asic_reset_res = 0; 5313 5287 goto retry; 5314 5288 } 5289 + 5290 + if (!r && gpu_reset_for_dev_remove) 5291 + goto recover_end; 5315 5292 } 5316 5293 5317 5294 skip_hw_reset: ··· 5388 5359 amdgpu_device_unset_mp1_state(tmp_adev); 5389 5360 } 5390 5361 5362 + recover_end: 5391 5363 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5392 5364 reset_list); 5393 5365 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+30
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
··· 2186 2186 pm_runtime_forbid(dev->dev); 2187 2187 } 2188 2188 2189 + if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)) { 2190 + bool need_to_reset_gpu = false; 2191 + 2192 + if (adev->gmc.xgmi.num_physical_nodes > 1) { 2193 + struct amdgpu_hive_info *hive; 2194 + 2195 + hive = amdgpu_get_xgmi_hive(adev); 2196 + if (hive->device_remove_count == 0) 2197 + need_to_reset_gpu = true; 2198 + hive->device_remove_count++; 2199 + amdgpu_put_xgmi_hive(hive); 2200 + } else { 2201 + need_to_reset_gpu = true; 2202 + } 2203 + 2204 + /* Workaround for ASICs need to reset SMU. 2205 + * Called only when the first device is removed. 2206 + */ 2207 + if (need_to_reset_gpu) { 2208 + struct amdgpu_reset_context reset_context; 2209 + 2210 + memset(&reset_context, 0, sizeof(reset_context)); 2211 + reset_context.method = AMD_RESET_METHOD_NONE; 2212 + reset_context.reset_req_dev = adev; 2213 + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 2214 + set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags); 2215 + amdgpu_device_gpu_recover(adev, NULL, &reset_context); 2216 + } 2217 + } 2218 + 2189 2219 amdgpu_driver_unload_kms(dev); 2190 2220 2191 2221 drm_dev_unplug(dev);
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
··· 31 31 AMDGPU_NEED_FULL_RESET = 0, 32 32 AMDGPU_SKIP_HW_RESET = 1, 33 33 AMDGPU_SKIP_MODE2_RESET = 2, 34 + AMDGPU_RESET_FOR_DEVICE_REMOVE = 3, 34 35 }; 35 36 36 37 struct amdgpu_reset_context {
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
··· 43 43 } pstate; 44 44 45 45 struct amdgpu_reset_domain *reset_domain; 46 + uint32_t device_remove_count; 46 47 }; 47 48 48 49 struct amdgpu_pcs_ras_field {