Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/msm: Hangcheck progress detection

If the hangcheck timer expires, check if the fw's position in the
cmdstream has advanced (changed) since last timer expiration, and
allow it up to three additional "extensions" to its allotted time.
The intention is to continue to catch "shader stuck in a loop" type
hangs quickly, but allow more time for things that are actually
making forward progress.

Because we need to sample the CP state twice to detect if there has
not been progress, this also cuts the timer's duration in half.

v2: Fix typo (REG_A6XX_CP_CSQ_IB2_STAT), add comment
v3: Only halve hangcheck timer duration for generations which
support progress detection (hdanton); removed unused a5xx
progress (without knowing how to adjust for data buffered
in ROQ it is too likely to report a false negative)
v4: Comment updates to better describe the total hangcheck
duration when progress detection is applied

Reviewed-by: Chia-I Wu <olvaffe@gmail.com>
Tested-by: Chia-I Wu <olvaffe@gmail.com> # dEQP-GLES2.functional.flush_finish.wait
Signed-off-by: Rob Clark <robdclark@chromium.org>
Reviewed-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
Patchwork: https://patchwork.freedesktop.org/patch/511584/
Link: https://lore.kernel.org/r/20221114193049.1533391-3-robdclark@gmail.com

+109 -3
+34
drivers/gpu/drm/msm/adreno/a6xx_gpu.c
··· 1843 1843 return ring->memptrs->rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR); 1844 1844 } 1845 1845 1846 + static bool a6xx_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring) 1847 + { 1848 + struct msm_cp_state cp_state = { 1849 + .ib1_base = gpu_read64(gpu, REG_A6XX_CP_IB1_BASE), 1850 + .ib2_base = gpu_read64(gpu, REG_A6XX_CP_IB2_BASE), 1851 + .ib1_rem = gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE), 1852 + .ib2_rem = gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE), 1853 + }; 1854 + bool progress; 1855 + 1856 + /* 1857 + * Adjust the remaining data to account for what has already been 1858 + * fetched from memory, but not yet consumed by the SQE. 1859 + * 1860 + * This is not *technically* correct, the amount buffered could 1861 + * exceed the IB size due to hw prefetching ahead, but: 1862 + * 1863 + * (1) We aren't trying to find the exact position, just whether 1864 + * progress has been made 1865 + * (2) The CP_REG_TO_MEM at the end of a submit should be enough 1866 + * to prevent prefetching into an unrelated submit. (And 1867 + * either way, at some point the ROQ will be full.) 1868 + */ 1869 + cp_state.ib1_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16; 1870 + cp_state.ib2_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB2_STAT) >> 16; 1871 + 1872 + progress = !!memcmp(&cp_state, &ring->last_cp_state, sizeof(cp_state)); 1873 + 1874 + ring->last_cp_state = cp_state; 1875 + 1876 + return progress; 1877 + } 1878 + 1846 1879 static u32 a618_get_speed_bin(u32 fuse) 1847 1880 { 1848 1881 if (fuse == 0) ··· 1992 1959 .create_address_space = a6xx_create_address_space, 1993 1960 .create_private_address_space = a6xx_create_private_address_space, 1994 1961 .get_rptr = a6xx_get_rptr, 1962 + .progress = a6xx_progress, 1995 1963 }, 1996 1964 .get_timestamp = a6xx_get_timestamp, 1997 1965 };
-1
drivers/gpu/drm/msm/msm_drv.c
··· 419 419 priv->dev = ddev; 420 420 421 421 priv->wq = alloc_ordered_workqueue("msm", 0); 422 - priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD; 423 422 424 423 INIT_LIST_HEAD(&priv->objects); 425 424 mutex_init(&priv->obj_lock);
+7 -1
drivers/gpu/drm/msm/msm_drv.h
··· 224 224 225 225 struct drm_atomic_state *pm_state; 226 226 227 - /* For hang detection, in ms */ 227 + /** 228 + * hangcheck_period: For hang detection, in ms 229 + * 230 + * Note that in practice, a submit/job will get at least two hangcheck 231 + * periods, due to checking for progress being implemented as simply 232 + * "have the CP position registers changed since last time?" 233 + */ 228 234 unsigned int hangcheck_period; 229 235 230 236 /**
+30 -1
drivers/gpu/drm/msm/msm_gpu.c
··· 492 492 round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period))); 493 493 } 494 494 495 + static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring) 496 + { 497 + if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES) 498 + return false; 499 + 500 + if (!gpu->funcs->progress) 501 + return false; 502 + 503 + if (!gpu->funcs->progress(gpu, ring)) 504 + return false; 505 + 506 + ring->hangcheck_progress_retries++; 507 + return true; 508 + } 509 + 495 510 static void hangcheck_handler(struct timer_list *t) 496 511 { 497 512 struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer); ··· 517 502 if (fence != ring->hangcheck_fence) { 518 503 /* some progress has been made.. ya! */ 519 504 ring->hangcheck_fence = fence; 520 - } else if (fence_before(fence, ring->fctx->last_fence)) { 505 + ring->hangcheck_progress_retries = 0; 506 + } else if (fence_before(fence, ring->fctx->last_fence) && 507 + !made_progress(gpu, ring)) { 521 508 /* no progress and not done.. hung! */ 522 509 ring->hangcheck_fence = fence; 510 + ring->hangcheck_progress_retries = 0; 523 511 DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n", 524 512 gpu->name, ring->id); 525 513 DRM_DEV_ERROR(dev->dev, "%s: completed fence: %u\n", ··· 848 830 struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, 849 831 const char *name, struct msm_gpu_config *config) 850 832 { 833 + struct msm_drm_private *priv = drm->dev_private; 851 834 int i, ret, nr_rings = config->nr_rings; 852 835 void *memptrs; 853 836 uint64_t memptrs_iova; ··· 875 856 kthread_init_work(&gpu->retire_work, retire_worker); 876 857 kthread_init_work(&gpu->recover_work, recover_worker); 877 858 kthread_init_work(&gpu->fault_work, fault_worker); 859 + 860 + priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD; 861 + 862 + /* 863 + * If progress detection is supported, halve the hangcheck timer 864 + * duration, as it takes two iterations of the hangcheck handler 865 + * to detect a hang. 866 + */ 867 + if (funcs->progress) 868 + priv->hangcheck_period /= 2; 878 869 879 870 timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0); 880 871
+10
drivers/gpu/drm/msm/msm_gpu.h
··· 78 78 struct msm_gem_address_space *(*create_private_address_space) 79 79 (struct msm_gpu *gpu); 80 80 uint32_t (*get_rptr)(struct msm_gpu *gpu, struct msm_ringbuffer *ring); 81 + 82 + /** 83 + * progress: Has the GPU made progress? 84 + * 85 + * Return true if GPU position in cmdstream has advanced (or changed) 86 + * since the last call. To avoid false negatives, this should account 87 + * for cmdstream that is buffered in this FIFO upstream of the CP fw. 88 + */ 89 + bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring); 81 90 }; 82 91 83 92 /* Additional state for iommu faults: */ ··· 246 237 #define DRM_MSM_INACTIVE_PERIOD 66 /* in ms (roughly four frames) */ 247 238 248 239 #define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */ 240 + #define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3 249 241 struct timer_list hangcheck_timer; 250 242 251 243 /* Fault info for most recent iova fault: */
+28
drivers/gpu/drm/msm/msm_ringbuffer.h
··· 35 35 volatile u64 ttbr0; 36 36 }; 37 37 38 + struct msm_cp_state { 39 + uint64_t ib1_base, ib2_base; 40 + uint32_t ib1_rem, ib2_rem; 41 + }; 42 + 38 43 struct msm_ringbuffer { 39 44 struct msm_gpu *gpu; 40 45 int id; ··· 68 63 struct msm_rbmemptrs *memptrs; 69 64 uint64_t memptrs_iova; 70 65 struct msm_fence_context *fctx; 66 + 67 + /** 68 + * hangcheck_progress_retries: 69 + * 70 + * The number of extra hangcheck duration cycles that we have given 71 + * due to it appearing that the GPU is making forward progress. 72 + * 73 + * For GPU generations which support progress detection (see. 74 + * msm_gpu_funcs::progress()), if the GPU appears to be making progress 75 + * (ie. the CP has advanced in the command stream, we'll allow up to 76 + * DRM_MSM_HANGCHECK_PROGRESS_RETRIES expirations of the hangcheck timer 77 + * before killing the job. But to detect progress we need two sample 78 + * points, so the duration of the hangcheck timer is halved. In other 79 + * words we'll let the submit run for up to: 80 + * 81 + * (DRM_MSM_HANGCHECK_DEFAULT_PERIOD / 2) * (DRM_MSM_HANGCHECK_PROGRESS_RETRIES + 1) 82 + */ 83 + int hangcheck_progress_retries; 84 + 85 + /** 86 + * last_cp_state: The state of the CP at the last call to gpu->progress() 87 + */ 88 + struct msm_cp_state last_cp_state; 71 89 72 90 /* 73 91 * preempt_lock protects preemption and serializes wptr updates against