Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: suspend ras module before gpu reset

During a GPU reset, all GPU-related resources are
inaccessible. To avoid affecting RAS functionality,
suspend the RAS module before the GPU reset and resume
it after the GPU reset is complete.

V2:
Rename functions to avoid misunderstanding.

V3:
Move flush_delayed_work to amdgpu_ras_process_pause,
Move schedule_delayed_work to amdgpu_ras_process_unpause.

V4:
Rename functions.

V5:
Move the function to amdgpu_ras.c.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

YiPeng Chai and committed by
Alex Deucher
d95ca7f5 d4432f16

+148 -2
+5
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 71 71 72 72 #include "amdgpu_xgmi.h" 73 73 #include "amdgpu_ras.h" 74 + #include "amdgpu_ras_mgr.h" 74 75 #include "amdgpu_pmu.h" 75 76 #include "amdgpu_fru_eeprom.h" 76 77 #include "amdgpu_reset.h" ··· 6661 6660 goto end_reset; 6662 6661 } 6663 6662 6663 + /* Cannot be called after locking reset domain */ 6664 + amdgpu_ras_pre_reset(adev, &device_list); 6665 + 6664 6666 /* We need to lock reset domain only once both for XGMI and single device */ 6665 6667 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6666 6668 ··· 6695 6691 reset_unlock: 6696 6692 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6697 6693 end_reset: 6694 + amdgpu_ras_post_reset(adev, &device_list); 6698 6695 if (hive) { 6699 6696 mutex_unlock(&hive->hive_lock); 6700 6697 amdgpu_put_xgmi_hive(hive);
+28 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 2921 2921 type = amdgpu_ras_get_fatal_error_event(adev); 2922 2922 list_for_each_entry(remote_adev, 2923 2923 device_list_handle, gmc.xgmi.head) { 2924 - amdgpu_ras_query_err_status(remote_adev); 2925 - amdgpu_ras_log_on_err_counter(remote_adev, type); 2924 + if (amdgpu_uniras_enabled(remote_adev)) { 2925 + amdgpu_ras_mgr_update_ras_ecc(remote_adev); 2926 + } else { 2927 + amdgpu_ras_query_err_status(remote_adev); 2928 + amdgpu_ras_log_on_err_counter(remote_adev, type); 2929 + } 2926 2930 } 2927 2931 2928 2932 } ··· 5676 5672 mutex_unlock(&con->critical_region_lock); 5677 5673 5678 5674 return ret; 5675 + } 5676 + 5677 + void amdgpu_ras_pre_reset(struct amdgpu_device *adev, 5678 + struct list_head *device_list) 5679 + { 5680 + struct amdgpu_device *tmp_adev = NULL; 5681 + 5682 + list_for_each_entry(tmp_adev, device_list, reset_list) { 5683 + if (amdgpu_uniras_enabled(tmp_adev)) 5684 + amdgpu_ras_mgr_pre_reset(tmp_adev); 5685 + } 5686 + } 5687 + 5688 + void amdgpu_ras_post_reset(struct amdgpu_device *adev, 5689 + struct list_head *device_list) 5690 + { 5691 + struct amdgpu_device *tmp_adev = NULL; 5692 + 5693 + list_for_each_entry(tmp_adev, device_list, reset_list) { 5694 + if (amdgpu_uniras_enabled(tmp_adev)) 5695 + amdgpu_ras_mgr_post_reset(tmp_adev); 5696 + } 5679 5697 }
+5
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
··· 1039 1039 const char *fmt, ...); 1040 1040 1041 1041 bool amdgpu_ras_is_rma(struct amdgpu_device *adev); 1042 + 1043 + void amdgpu_ras_pre_reset(struct amdgpu_device *adev, 1044 + struct list_head *device_list); 1045 + void amdgpu_ras_post_reset(struct amdgpu_device *adev, 1046 + struct list_head *device_list); 1042 1047 #endif
+22
drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c
··· 624 624 625 625 return ret; 626 626 } 627 + 628 + int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev) 629 + { 630 + if (!amdgpu_ras_mgr_is_ready(adev)) { 631 + RAS_DEV_ERR(adev, "Invalid ras suspend!\n"); 632 + return -EPERM; 633 + } 634 + 635 + amdgpu_ras_process_pre_reset(adev); 636 + return 0; 637 + } 638 + 639 + int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev) 640 + { 641 + if (!amdgpu_ras_mgr_is_ready(adev)) { 642 + RAS_DEV_ERR(adev, "Invalid ras resume!\n"); 643 + return -EPERM; 644 + } 645 + 646 + amdgpu_ras_process_post_reset(adev); 647 + return 0; 648 + }
+5
drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h
··· 52 52 struct ras_event_manager ras_event_mgr; 53 53 uint64_t last_poison_consumption_seqno; 54 54 bool ras_is_ready; 55 + 56 + bool is_paused; 57 + struct completion ras_event_done; 55 58 }; 56 59 57 60 extern const struct amdgpu_ip_block_version ras_v1_0_ip_block; ··· 78 75 int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev, 79 76 uint32_t cmd_id, void *input, uint32_t input_size, 80 77 void *output, uint32_t out_size); 78 + int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev); 79 + int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev); 81 80 #endif
+64
drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c
··· 29 29 #include "amdgpu_ras_process.h" 30 30 31 31 #define RAS_MGR_RETIRE_PAGE_INTERVAL 100 32 + #define RAS_EVENT_PROCESS_TIMEOUT 1200 32 33 33 34 static void ras_process_retire_page_dwork(struct work_struct *work) 34 35 { ··· 58 57 { 59 58 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 60 59 60 + ras_mgr->is_paused = false; 61 + init_completion(&ras_mgr->ras_event_done); 62 + 61 63 INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork); 62 64 63 65 return 0; ··· 70 66 { 71 67 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 72 68 69 + ras_mgr->is_paused = false; 73 70 /* Save all cached bad pages to eeprom */ 74 71 flush_delayed_work(&ras_mgr->retire_page_dwork); 75 72 cancel_delayed_work_sync(&ras_mgr->retire_page_dwork); ··· 128 123 req.seqno = seqno; 129 124 130 125 return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false); 126 + } 127 + 128 + int amdgpu_ras_process_begin(struct amdgpu_device *adev) 129 + { 130 + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 131 + 132 + if (ras_mgr->is_paused) 133 + return -EAGAIN; 134 + 135 + reinit_completion(&ras_mgr->ras_event_done); 136 + return 0; 137 + } 138 + 139 + int amdgpu_ras_process_end(struct amdgpu_device *adev) 140 + { 141 + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 142 + 143 + complete(&ras_mgr->ras_event_done); 144 + return 0; 145 + } 146 + 147 + int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev) 148 + { 149 + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 150 + long rc; 151 + 152 + if (!ras_mgr || !ras_mgr->ras_core) 153 + return -EINVAL; 154 + 155 + if (!ras_mgr->ras_core->is_initialized) 156 + return -EPERM; 157 + 158 + ras_mgr->is_paused = true; 159 + 160 + /* Wait for RAS event processing to complete */ 161 + rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done, 162 + msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT)); 163 + if (rc <= 0) 164 + RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n", 165 + rc ? "interrupted" : "timeout"); 166 + 167 + flush_delayed_work(&ras_mgr->retire_page_dwork); 168 + return 0; 169 + } 170 + 171 + int amdgpu_ras_process_post_reset(struct amdgpu_device *adev) 172 + { 173 + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 174 + 175 + if (!ras_mgr || !ras_mgr->ras_core) 176 + return -EINVAL; 177 + 178 + if (!ras_mgr->ras_core->is_initialized) 179 + return -EPERM; 180 + 181 + ras_mgr->is_paused = false; 182 + 183 + schedule_delayed_work(&ras_mgr->retire_page_dwork, 0); 184 + return 0; 131 185 }
+4
drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h
··· 34 34 void *data); 35 35 int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, 36 36 void *data); 37 + int amdgpu_ras_process_begin(struct amdgpu_device *adev); 38 + int amdgpu_ras_process_end(struct amdgpu_device *adev); 39 + int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev); 40 + int amdgpu_ras_process_post_reset(struct amdgpu_device *adev); 37 41 #endif
+6
drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
··· 142 142 case RAS_EVENT_ID__RESET_GPU: 143 143 ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data); 144 144 break; 145 + case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN: 146 + ret = amdgpu_ras_process_begin(ras_core->dev); 147 + break; 148 + case RAS_EVENT_ID__RAS_EVENT_PROC_END: 149 + ret = amdgpu_ras_process_end(ras_core->dev); 150 + break; 145 151 default: 146 152 RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id); 147 153 break;
+2
drivers/gpu/drm/amd/ras/rascore/ras.h
··· 115 115 RAS_EVENT_ID__FATAL_ERROR_DETECTED, 116 116 RAS_EVENT_ID__RESET_GPU, 117 117 RAS_EVENT_ID__RESET_VF, 118 + RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, 119 + RAS_EVENT_ID__RAS_EVENT_PROC_END, 118 120 }; 119 121 120 122 enum ras_gpu_status {
+7
drivers/gpu/drm/amd/ras/rascore/ras_process.c
··· 162 162 uint32_t umc_event_count; 163 163 int ret; 164 164 165 + ret = ras_core_event_notify(ras_core, 166 + RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL); 167 + if (ret) 168 + return ret; 169 + 165 170 ras_aca_clear_fatal_flag(ras_core); 166 171 ras_umc_log_pending_bad_bank(ras_core); 167 172 ··· 190 185 atomic_set(&ras_proc->umc_interrupt_count, 0); 191 186 } 192 187 188 + ras_core_event_notify(ras_core, 189 + RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL); 193 190 return ret; 194 191 } 195 192