Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdkfd: Skip packet submission on fatal error

If a fatal error is detected, packet submission won't go through. Return
an error in such cases. Also, avoid waiting for the fence when a fatal
error is detected.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Lijo Lazar and committed by Alex Deucher
e1f6746f 1b6ef74b

+23 -7
+5
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
··· 742 742 amdgpu_device_flush_hdp(adev, NULL); 743 743 } 744 744 745 + bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev) 746 + { 747 + return amdgpu_ras_get_fed_status(adev); 748 + } 749 + 745 750 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, 746 751 enum amdgpu_ras_block block, bool reset) 747 752 {
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
··· 337 337 struct tile_config *config); 338 338 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, 339 339 enum amdgpu_ras_block block, bool reset); 340 + bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); 340 341 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); 341 342 void amdgpu_amdkfd_block_mmu_notifications(void *p); 342 343 int amdgpu_amdkfd_criu_resume(void *p);
+4
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
··· 1903 1903 uint64_t *fence_addr = dqm->fence_addr; 1904 1904 1905 1905 while (*fence_addr != fence_value) { 1906 + /* Fatal err detected, this response won't come */ 1907 + if (amdgpu_amdkfd_is_fed(dqm->dev->adev)) 1908 + return -EIO; 1909 + 1906 1910 if (time_after(jiffies, end_jiffies)) { 1907 1911 dev_err(dev, "qcm fence wait loop timeout expired\n"); 1908 1912 /* In HWS case, this is used to halt the driver thread
+7 -1
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
··· 286 286 return -ENOMEM; 287 287 } 288 288 289 - void kq_submit_packet(struct kernel_queue *kq) 289 + int kq_submit_packet(struct kernel_queue *kq) 290 290 { 291 291 #ifdef DEBUG 292 292 int i; ··· 298 298 } 299 299 pr_debug("\n"); 300 300 #endif 301 + /* Fatal err detected, packet submission won't go through */ 302 + if (amdgpu_amdkfd_is_fed(kq->dev->adev)) 303 + return -EIO; 304 + 301 305 if (kq->dev->kfd->device_info.doorbell_size == 8) { 302 306 *kq->wptr64_kernel = kq->pending_wptr64; 303 307 write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, ··· 311 307 write_kernel_doorbell(kq->queue->properties.doorbell_ptr, 312 308 kq->pending_wptr); 313 309 } 310 + 311 + return 0; 314 312 } 315 313 316 314 void kq_rollback_packet(struct kernel_queue *kq)
+1 -1
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
··· 47 47 int kq_acquire_packet_buffer(struct kernel_queue *kq, 48 48 size_t packet_size_in_dwords, 49 49 unsigned int **buffer_ptr); 50 - void kq_submit_packet(struct kernel_queue *kq); 50 + int kq_submit_packet(struct kernel_queue *kq); 51 51 void kq_rollback_packet(struct kernel_queue *kq); 52 52 53 53
+5 -5
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
··· 288 288 289 289 retval = pm->pmf->set_resources(pm, buffer, res); 290 290 if (!retval) 291 - kq_submit_packet(pm->priv_queue); 291 + retval = kq_submit_packet(pm->priv_queue); 292 292 else 293 293 kq_rollback_packet(pm->priv_queue); 294 294 ··· 325 325 if (retval) 326 326 goto fail_create_runlist; 327 327 328 - kq_submit_packet(pm->priv_queue); 328 + retval = kq_submit_packet(pm->priv_queue); 329 329 330 330 mutex_unlock(&pm->lock); 331 331 ··· 361 361 362 362 retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value); 363 363 if (!retval) 364 - kq_submit_packet(pm->priv_queue); 364 + retval = kq_submit_packet(pm->priv_queue); 365 365 else 366 366 kq_rollback_packet(pm->priv_queue); 367 367 ··· 392 392 393 393 retval = pm->pmf->set_grace_period(pm, buffer, grace_period); 394 394 if (!retval) 395 - kq_submit_packet(pm->priv_queue); 395 + retval = kq_submit_packet(pm->priv_queue); 396 396 else 397 397 kq_rollback_packet(pm->priv_queue); 398 398 } ··· 421 421 422 422 retval = pm->pmf->unmap_queues(pm, buffer, filter, filter_param, reset); 423 423 if (!retval) 424 - kq_submit_packet(pm->priv_queue); 424 + retval = kq_submit_packet(pm->priv_queue); 425 425 else 426 426 kq_rollback_packet(pm->priv_queue); 427 427