Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: race issue when jobs on 2 ring timeout

Fix a racing issue when jobs on 2 rings timeout simultaneously.

If 2 rings timed out at the same time, the
amdgpu_device_gpu_recover will be reentered. Then the
adev->gmc.xgmi.head will be grabbed by 2 local linked list,
which may cause wild pointer issue in iterating.

lock the device earily to prevent the node be added to 2
different lists.

also increase karma for the skipped job since the job is also
timed out and should be guilty.

Signed-off-by: Horace Chen <horace.chen@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Horace Chen and committed by
Alex Deucher
91fb309d eda1068d

+59 -10
+59 -10
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 4461 4461 up_write(&adev->reset_sem); 4462 4462 } 4463 4463 4464 + /* 4465 + * to lockup a list of amdgpu devices in a hive safely, if not a hive 4466 + * with multiple nodes, it will be similar as amdgpu_device_lock_adev. 4467 + * 4468 + * unlock won't require roll back. 4469 + */ 4470 + static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) 4471 + { 4472 + struct amdgpu_device *tmp_adev = NULL; 4473 + 4474 + if (adev->gmc.xgmi.num_physical_nodes > 1) { 4475 + if (!hive) { 4476 + dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); 4477 + return -ENODEV; 4478 + } 4479 + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4480 + if (!amdgpu_device_lock_adev(tmp_adev, hive)) 4481 + goto roll_back; 4482 + } 4483 + } else if (!amdgpu_device_lock_adev(adev, hive)) 4484 + return -EAGAIN; 4485 + 4486 + return 0; 4487 + roll_back: 4488 + if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { 4489 + /* 4490 + * if the lockup iteration break in the middle of a hive, 4491 + * it may means there may has a race issue, 4492 + * or a hive device locked up independently. 4493 + * we may be in trouble and may not, so will try to roll back 4494 + * the lock and give out a warnning. 4495 + */ 4496 + dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock"); 4497 + list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4498 + amdgpu_device_unlock_adev(tmp_adev); 4499 + } 4500 + } 4501 + return -EAGAIN; 4502 + } 4503 + 4464 4504 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4465 4505 { 4466 4506 struct pci_dev *p = NULL; ··· 4614 4574 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4615 4575 job ? job->base.id : -1, hive->hive_id); 4616 4576 amdgpu_put_xgmi_hive(hive); 4577 + if (job) 4578 + drm_sched_increase_karma(&job->base); 4617 4579 return 0; 4618 4580 } 4619 4581 mutex_lock(&hive->hive_lock); 4582 + } 4583 + 4584 + /* 4585 + * lock the device before we try to operate the linked list 4586 + * if didn't get the device lock, don't touch the linked list since 4587 + * others may iterating it. 4588 + */ 4589 + r = amdgpu_device_lock_hive_adev(adev, hive); 4590 + if (r) { 4591 + dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4592 + job ? job->base.id : -1); 4593 + 4594 + /* even we skipped this reset, still need to set the job to guilty */ 4595 + if (job) 4596 + drm_sched_increase_karma(&job->base); 4597 + goto skip_recovery; 4620 4598 } 4621 4599 4622 4600 /* ··· 4644 4586 */ 4645 4587 INIT_LIST_HEAD(&device_list); 4646 4588 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4647 - if (!hive) 4648 - return -ENODEV; 4649 4589 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) 4650 4590 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); 4651 4591 device_list_handle = &hive->device_list; ··· 4654 4598 4655 4599 /* block all schedulers and reset given job's ring */ 4656 4600 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4657 - if (!amdgpu_device_lock_adev(tmp_adev, hive)) { 4658 - dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4659 - job ? job->base.id : -1); 4660 - r = 0; 4661 - goto skip_recovery; 4662 - } 4663 - 4664 4601 /* 4665 4602 * Try to put the audio codec into suspend state 4666 4603 * before gpu reset started. ··· 4791 4742 amdgpu_put_xgmi_hive(hive); 4792 4743 } 4793 4744 4794 - if (r) 4745 + if (r && r != -EAGAIN) 4795 4746 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4796 4747 return r; 4797 4748 }