Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
at v5.1 334 lines 9.3 kB view raw
/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "i915_drv.h"
#include "i915_reset.h"

/*
 * Per-engine sample taken on each hangcheck pass. load/store helpers below
 * copy the live fields in and out of engine->hangcheck between passes so
 * successive samples can be compared.
 */
struct hangcheck {
	u64 acthd;	/* active head address sampled from the engine */
	u32 seqno;	/* last breadcrumb seqno sampled from the engine */
	enum intel_engine_hangcheck_action action; /* verdict for this pass */
	unsigned long action_timestamp;	/* jiffies of last observed progress */
	int deadlock;	/* NOTE(review): not referenced in this file — confirm users elsewhere */
	struct intel_instdone instdone;	/* accumulated INSTDONE progress bits */
	bool wedged:1;	/* no progress for I915_ENGINE_WEDGED_TIMEOUT */
	bool stalled:1;	/* no progress for the action-dependent timeout */
};

/*
 * Fold the freshly read INSTDONE bits into the accumulator and report
 * whether anything new appeared. Returns true when *old_instdone already
 * contained every bit of current_instdone, i.e. no fresh activity was seen
 * since the accumulator was last cleared.
 */
static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
{
	u32 tmp = current_instdone | *old_instdone;
	bool unchanged;

	unchanged = tmp == *old_instdone;
	*old_instdone |= tmp;

	return unchanged;
}

/*
 * Check whether the render engine's execution subunits show any sign of
 * progress via their INSTDONE registers. Only RCS is sampled here; every
 * other engine is reported as "stuck" so the caller's verdict is decided
 * by the head/seqno checks alone.
 */
static bool subunits_stuck(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct intel_instdone instdone;
	struct intel_instdone *accu_instdone = &engine->hangcheck.instdone;
	bool stuck;
	int slice;
	int subslice;

	if (engine->id != RCS)
		return true;

	intel_engine_get_instdone(engine, &instdone);

	/* There might be unstable subunit states even when
	 * actual head is not moving. Filter out the unstable ones by
	 * accumulating the undone -> done transitions and only
	 * consider those as progress.
	 */
	stuck = instdone_unchanged(instdone.instdone,
				   &accu_instdone->instdone);
	stuck &= instdone_unchanged(instdone.slice_common,
				    &accu_instdone->slice_common);

	/* Every sampled unit must be unchanged for the engine to count as stuck */
	for_each_instdone_slice_subslice(dev_priv, slice, subslice) {
		stuck &= instdone_unchanged(instdone.sampler[slice][subslice],
					    &accu_instdone->sampler[slice][subslice]);
		stuck &= instdone_unchanged(instdone.row[slice][subslice],
					    &accu_instdone->row[slice][subslice]);
	}

	return stuck;
}

/*
 * Classify an engine whose seqno has not advanced, based on whether its
 * active head or execution subunits are still moving:
 *   - head moved since last sample  -> ENGINE_ACTIVE_HEAD
 *   - head static, subunits moving  -> ENGINE_ACTIVE_SUBUNITS
 *   - neither moving                -> ENGINE_DEAD
 */
static enum intel_engine_hangcheck_action
head_stuck(struct intel_engine_cs *engine, u64 acthd)
{
	if (acthd != engine->hangcheck.acthd) {

		/* Clear subunit states on head movement */
		memset(&engine->hangcheck.instdone, 0,
		       sizeof(engine->hangcheck.instdone));

		return ENGINE_ACTIVE_HEAD;
	}

	if (!subunits_stuck(engine))
		return ENGINE_ACTIVE_SUBUNITS;

	return ENGINE_DEAD;
}

/*
 * Refine a dead-looking engine's verdict: if the ring is merely blocked on
 * a semaphore wait (RING_WAIT set in the ring CTL register), kick it by
 * rewriting the CTL register and report ENGINE_WAIT_KICK instead of
 * declaring the engine dead. Gen2 has no such kick, so it stays DEAD.
 */
static enum intel_engine_hangcheck_action
engine_stuck(struct intel_engine_cs *engine, u64 acthd)
{
	struct drm_i915_private *dev_priv = engine->i915;
	enum intel_engine_hangcheck_action ha;
	u32 tmp;

	ha = head_stuck(engine, acthd);
	if (ha != ENGINE_DEAD)
		return ha;

	if (IS_GEN(dev_priv, 2))
		return ENGINE_DEAD;

	/* Is the chip hanging on a WAIT_FOR_EVENT?
	 * If so we can simply poke the RB_WAIT bit
	 * and break the hang. This should work on
	 * all but the second generation chipsets.
	 */
	tmp = I915_READ_CTL(engine);
	if (tmp & RING_WAIT) {
		i915_handle_error(dev_priv, BIT(engine->id), 0,
				  "stuck wait on %s", engine->name);
		I915_WRITE_CTL(engine, tmp);
		return ENGINE_WAIT_KICK;
	}

	return ENGINE_DEAD;
}

/* Take a fresh snapshot of the engine's active head and completed seqno. */
static void hangcheck_load_sample(struct intel_engine_cs *engine,
				  struct hangcheck *hc)
{
	hc->acthd = intel_engine_get_active_head(engine);
	hc->seqno = intel_engine_get_seqno(engine);
}

/* Persist this pass's sample so the next pass can compare against it. */
static void hangcheck_store_sample(struct intel_engine_cs *engine,
				   const struct hangcheck *hc)
{
	engine->hangcheck.acthd = hc->acthd;
	engine->hangcheck.seqno = hc->seqno;
}

/*
 * Decide the engine's state for this pass. Seqno advancement is checked
 * first (definite progress), then idleness; only an engine that is busy
 * on the same seqno is subjected to the head/subunit/wait inspection.
 */
static enum intel_engine_hangcheck_action
hangcheck_get_action(struct intel_engine_cs *engine,
		     const struct hangcheck *hc)
{
	if (engine->hangcheck.seqno != hc->seqno)
		return ENGINE_ACTIVE_SEQNO;

	if (intel_engine_is_idle(engine))
		return ENGINE_IDLE;

	return engine_stuck(engine, hc->acthd);
}

/*
 * Fold this pass's sample into the engine's hangcheck state and compute
 * the stalled/wedged verdicts. Progress (seqno movement, idle, or a wait
 * kick) refreshes action_timestamp; an engine whose head or subunits are
 * still moving gets the longer I915_SEQNO_DEAD_TIMEOUT leeway, while a
 * fully dead engine is held to I915_ENGINE_DEAD_TIMEOUT.
 */
static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
					struct hangcheck *hc)
{
	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;

	hc->action = hangcheck_get_action(engine, hc);

	/* We always increment the progress
	 * if the engine is busy and still processing
	 * the same request, so that no single request
	 * can run indefinitely (such as a chain of
	 * batches). The only time we do not increment
	 * the hangcheck score on this ring, if this
	 * engine is in a legitimate wait for another
	 * engine. In that case the waiting engine is a
	 * victim and we want to be sure we catch the
	 * right culprit. Then every time we do kick
	 * the ring, make it as a progress as the seqno
	 * advancement might ensure and if not, it
	 * will catch the hanging engine.
	 */

	switch (hc->action) {
	case ENGINE_IDLE:
	case ENGINE_ACTIVE_SEQNO:
		/* Clear head and subunit states on seqno movement */
		hc->acthd = 0;

		memset(&engine->hangcheck.instdone, 0,
		       sizeof(engine->hangcheck.instdone));

		/* Intentional fall through */
	case ENGINE_WAIT_KICK:
	case ENGINE_WAIT:
		engine->hangcheck.action_timestamp = jiffies;
		break;

	case ENGINE_ACTIVE_HEAD:
	case ENGINE_ACTIVE_SUBUNITS:
		/*
		 * Seqno stuck with still active engine gets leeway,
		 * in hopes that it is just a long shader.
		 */
		timeout = I915_SEQNO_DEAD_TIMEOUT;
		break;

	case ENGINE_DEAD:
		break;

	default:
		MISSING_CASE(hc->action);
	}

	/* Verdicts: how long has it been since action_timestamp was refreshed? */
	hc->stalled = time_after(jiffies,
				 engine->hangcheck.action_timestamp + timeout);
	hc->wedged = time_after(jiffies,
				engine->hangcheck.action_timestamp +
				I915_ENGINE_WEDGED_TIMEOUT);
}

/*
 * Report the detected hang to the error handler, naming the culprit
 * engines in the message. The trailing ", " left by the loop is trimmed
 * by overwriting its first character with the terminator.
 */
static void hangcheck_declare_hang(struct drm_i915_private *i915,
				   unsigned int hung,
				   unsigned int stuck)
{
	struct intel_engine_cs *engine;
	char msg[80];
	unsigned int tmp;
	int len;

	/* If some rings hung but others were still busy, only
	 * blame the hanging rings in the synopsis.
	 */
	if (stuck != hung)
		hung &= ~stuck;
	len = scnprintf(msg, sizeof(msg),
			"%s on ", stuck == hung ? "no progress" : "hang");
	for_each_engine_masked(engine, i915, hung, tmp)
		len += scnprintf(msg + len, sizeof(msg) - len,
				 "%s, ", engine->name);
	msg[len-2] = '\0';

	return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
}

/*
 * This is called when the chip hasn't reported back with completed
 * batchbuffers in a long time. We keep track per ring seqno progress and
 * if there are no progress, hangcheck score for that ring is increased.
 * Further, acthd is inspected to see if the ring is stuck. On stuck case
 * we kick the ring. If we see no progress on three subsequent calls
 * we assume chip is wedged and try to fix it by resetting the chip.
 */
static void i915_hangcheck_elapsed(struct work_struct *work)
{
	struct drm_i915_private *dev_priv =
		container_of(work, typeof(*dev_priv),
			     gpu_error.hangcheck_work.work);
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned int hung = 0, stuck = 0, wedged = 0;

	/* Nothing to do if hangcheck is disabled, the GPU is asleep,
	 * or the device has already been declared terminally wedged.
	 */
	if (!i915_modparams.enable_hangcheck)
		return;

	if (!READ_ONCE(dev_priv->gt.awake))
		return;

	if (i915_terminally_wedged(&dev_priv->gpu_error))
		return;

	/* As enabling the GPU requires fairly extensive mmio access,
	 * periodically arm the mmio checker to see if we are triggering
	 * any invalid access.
	 */
	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);

	/* Sample every engine and collect per-engine verdicts as bitmasks */
	for_each_engine(engine, dev_priv, id) {
		struct hangcheck hc;

		intel_engine_signal_breadcrumbs(engine);

		hangcheck_load_sample(engine, &hc);
		hangcheck_accumulate_sample(engine, &hc);
		hangcheck_store_sample(engine, &hc);

		if (hc.stalled) {
			hung |= intel_engine_flag(engine);
			if (hc.action != ENGINE_DEAD)
				stuck |= intel_engine_flag(engine);
		}

		if (hc.wedged)
			wedged |= intel_engine_flag(engine);
	}

	/* Dump the busy engines' state for debugging before acting on it */
	if (GEM_SHOW_DEBUG() && (hung | stuck)) {
		struct drm_printer p = drm_debug_printer("hangcheck");

		for_each_engine(engine, dev_priv, id) {
			if (intel_engine_is_idle(engine))
				continue;

			intel_engine_dump(engine, &p, "%s\n", engine->name);
		}
	}

	/* Past the wedged timeout: give up on recovery entirely */
	if (wedged) {
		dev_err(dev_priv->drm.dev,
			"GPU recovery timed out,"
			" cancelling all in-flight rendering.\n");
		GEM_TRACE_DUMP();
		i915_gem_set_wedged(dev_priv);
	}

	if (hung)
		hangcheck_declare_hang(dev_priv, hung, stuck);

	/* Reset timer in case GPU hangs without another request being added */
	i915_queue_hangcheck(dev_priv);
}

/* Reset an engine's hangcheck state, e.g. after init or reset. */
void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
{
	memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
	engine->hangcheck.action_timestamp = jiffies;
}

/* One-time setup of the periodic hangcheck delayed work for the device. */
void intel_hangcheck_init(struct drm_i915_private *i915)
{
	INIT_DELAYED_WORK(&i915->gpu_error.hangcheck_work,
			  i915_hangcheck_elapsed);
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/intel_hangcheck.c"
#endif