Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: add variable to record the deferred error number read by driver

Add a variable to record the number of deferred errors
read by the driver.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by YiPeng Chai and committed by Alex Deucher
78146c1d 29b6985d

+49 -22
+45 -19
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 120 120 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ 121 121 #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) 122 122 123 - #define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms 123 + #define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms 124 124 125 125 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms 126 126 ··· 2799 2799 memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key)); 2800 2800 2801 2801 INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); 2802 - ecc_log->de_updated = false; 2802 + ecc_log->de_queried_count = 0; 2803 + ecc_log->prev_de_queried_count = 0; 2803 2804 } 2804 2805 2805 2806 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) ··· 2819 2818 mutex_unlock(&ecc_log->lock); 2820 2819 2821 2820 mutex_destroy(&ecc_log->lock); 2822 - ecc_log->de_updated = false; 2821 + ecc_log->de_queried_count = 0; 2822 + ecc_log->prev_de_queried_count = 0; 2823 2823 } 2824 2824 2825 2825 static void amdgpu_ras_do_page_retirement(struct work_struct *work) ··· 2852 2850 mutex_unlock(&con->umc_ecc_log.lock); 2853 2851 } 2854 2852 2855 - static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, 2856 - uint32_t timeout_ms) 2853 + static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, 2854 + uint32_t poison_creation_count) 2857 2855 { 2858 2856 int ret = 0; 2859 2857 struct ras_ecc_log_info *ecc_log; 2860 2858 struct ras_query_if info; 2861 - uint32_t timeout = timeout_ms; 2859 + uint32_t timeout = 0; 2862 2860 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 2861 + uint64_t de_queried_count; 2862 + uint32_t new_detect_count, total_detect_count; 2863 + uint32_t need_query_count = poison_creation_count; 2864 + bool query_data_timeout = false; 2863 2865 2864 2866 memset(&info, 0, sizeof(info)); 2865 2867 info.head.block = AMDGPU_RAS_BLOCK__UMC; 2866 2868 2867 2869 ecc_log = &ras->umc_ecc_log; 2868 - ecc_log->de_updated = false; 2870 + total_detect_count = 0; 2869 2871 do { 2870 2872 ret = 
amdgpu_ras_query_error_status(adev, &info); 2871 - if (ret) { 2872 - dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret); 2873 - return; 2873 + if (ret) 2874 + return ret; 2875 + 2876 + de_queried_count = ecc_log->de_queried_count; 2877 + if (de_queried_count > ecc_log->prev_de_queried_count) { 2878 + new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; 2879 + ecc_log->prev_de_queried_count = de_queried_count; 2880 + timeout = 0; 2881 + } else { 2882 + new_detect_count = 0; 2874 2883 } 2875 2884 2876 - if (timeout && !ecc_log->de_updated) { 2877 - msleep(1); 2878 - timeout--; 2879 - } 2880 - } while (timeout && !ecc_log->de_updated); 2885 + if (new_detect_count) { 2886 + total_detect_count += new_detect_count; 2887 + } else { 2888 + if (!timeout && need_query_count) 2889 + timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; 2881 2890 2882 - if (timeout_ms && !timeout) { 2883 - dev_warn(adev->dev, "Can't find deferred error\n"); 2884 - return; 2891 + if (timeout) { 2892 + if (!--timeout) { 2893 + query_data_timeout = true; 2894 + break; 2895 + } 2896 + msleep(1); 2897 + } 2898 + } 2899 + } while (total_detect_count < need_query_count); 2900 + 2901 + if (query_data_timeout) { 2902 + dev_warn(adev->dev, "Can't find deferred error! count: %u\n", 2903 + (need_query_count - total_detect_count)); 2904 + return -ENOENT; 2885 2905 } 2886 2906 2887 - if (!ret) 2907 + if (total_detect_count) 2888 2908 schedule_delayed_work(&ras->page_retirement_dwork, 0); 2909 + 2910 + return 0; 2889 2911 } 2890 2912 2891 2913 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
+2 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
··· 469 469 struct mutex lock; 470 470 siphash_key_t ecc_key; 471 471 struct radix_tree_root de_page_tree; 472 - bool de_updated; 472 + uint64_t de_queried_count; 473 + uint64_t prev_de_queried_count; 473 474 }; 474 475 475 476 struct amdgpu_ras {
+2 -2
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
··· 557 557 ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err); 558 558 if (ret) { 559 559 if (ret == -EEXIST) 560 - con->umc_ecc_log.de_updated = true; 560 + con->umc_ecc_log.de_queried_count++; 561 561 else 562 562 dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret); 563 563 ··· 566 566 return ret; 567 567 } 568 568 569 - con->umc_ecc_log.de_updated = true; 569 + con->umc_ecc_log.de_queried_count++; 570 570 571 571 return 0; 572 572 }