Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/i915: Provide a timeout to i915_gem_wait_for_idle()

Usually we have no idea about the upper bound we need to wait to catch
up with userspace when idling the device, but in a few situations we
know the system was idle beforehand and can provide a short timeout in
order to very quickly catch a failure, long before hangcheck kicks in.

In the following patches, we will use the timeout to curtail two overly
long waits, where we know we can expect the GPU to complete within a
reasonable time or declare it broken.

In particular, with a broken GPU we expect it to fail during the initial
GPU setup where we do a couple of context switches to record the defaults.
This is a task that takes a few milliseconds even on the slowest of
devices, but we may have to wait 60s for hangcheck to give in and
declare the machine inoperable. This is a case where any gpu hang is
unacceptable, both from a timeliness and practical standpoint.

The other improvement is that in selftests, we do not need to arm an
independent timer to inject a wedge, as we can just limit the timeout on
the wait directly.

v2: Include the timeout parameter in the trace.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180709122044.7028-1-chris@chris-wilson.co.uk

+59 -33
+4 -2
drivers/gpu/drm/i915/i915_debugfs.c
··· 4105 4105 4106 4106 err = i915_gem_wait_for_idle(i915, 4107 4107 I915_WAIT_LOCKED | 4108 - I915_WAIT_INTERRUPTIBLE); 4108 + I915_WAIT_INTERRUPTIBLE, 4109 + MAX_SCHEDULE_TIMEOUT); 4109 4110 if (err) 4110 4111 goto err_unlock; 4111 4112 ··· 4211 4210 if (val & DROP_ACTIVE) 4212 4211 ret = i915_gem_wait_for_idle(dev_priv, 4213 4212 I915_WAIT_INTERRUPTIBLE | 4214 - I915_WAIT_LOCKED); 4213 + I915_WAIT_LOCKED, 4214 + MAX_SCHEDULE_TIMEOUT); 4215 4215 4216 4216 if (val & DROP_RETIRE) 4217 4217 i915_retire_requests(dev_priv);
+1 -1
drivers/gpu/drm/i915/i915_drv.h
··· 3157 3157 void i915_gem_fini(struct drm_i915_private *dev_priv); 3158 3158 void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv); 3159 3159 int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv, 3160 - unsigned int flags); 3160 + unsigned int flags, long timeout); 3161 3161 int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv); 3162 3162 void i915_gem_suspend_late(struct drm_i915_private *dev_priv); 3163 3163 void i915_gem_resume(struct drm_i915_private *dev_priv);
+29 -19
drivers/gpu/drm/i915/i915_gem.c
··· 2267 2267 2268 2268 /* Attempt to reap some mmap space from dead objects */ 2269 2269 do { 2270 - err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE); 2270 + err = i915_gem_wait_for_idle(dev_priv, 2271 + I915_WAIT_INTERRUPTIBLE, 2272 + MAX_SCHEDULE_TIMEOUT); 2271 2273 if (err) 2272 2274 break; 2273 2275 ··· 3744 3742 return ret; 3745 3743 } 3746 3744 3747 - static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags) 3745 + static long wait_for_timeline(struct i915_timeline *tl, 3746 + unsigned int flags, long timeout) 3748 3747 { 3749 3748 struct i915_request *rq; 3750 - long ret; 3751 3749 3752 3750 rq = i915_gem_active_get_unlocked(&tl->last_request); 3753 3751 if (!rq) 3754 - return 0; 3752 + return timeout; 3755 3753 3756 3754 /* 3757 3755 * "Race-to-idle". ··· 3765 3763 if (flags & I915_WAIT_FOR_IDLE_BOOST) 3766 3764 gen6_rps_boost(rq, NULL); 3767 3765 3768 - ret = i915_request_wait(rq, flags, MAX_SCHEDULE_TIMEOUT); 3766 + timeout = i915_request_wait(rq, flags, timeout); 3769 3767 i915_request_put(rq); 3770 3768 3771 - return ret < 0 ? ret : 0; 3769 + return timeout; 3772 3770 } 3773 3771 3774 3772 static int wait_for_engines(struct drm_i915_private *i915) ··· 3784 3782 return 0; 3785 3783 } 3786 3784 3787 - int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags) 3785 + int i915_gem_wait_for_idle(struct drm_i915_private *i915, 3786 + unsigned int flags, long timeout) 3788 3787 { 3789 - GEM_TRACE("flags=%x (%s)\n", 3790 - flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked"); 3788 + GEM_TRACE("flags=%x (%s), timeout=%ld%s\n", 3789 + flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked", 3790 + timeout, timeout == MAX_SCHEDULE_TIMEOUT ? 
" (forever)" : ""); 3791 3791 3792 3792 /* If the device is asleep, we have no requests outstanding */ 3793 3793 if (!READ_ONCE(i915->gt.awake)) ··· 3802 3798 lockdep_assert_held(&i915->drm.struct_mutex); 3803 3799 3804 3800 list_for_each_entry(tl, &i915->gt.timelines, link) { 3805 - err = wait_for_timeline(tl, flags); 3806 - if (err) 3807 - return err; 3801 + timeout = wait_for_timeline(tl, flags, timeout); 3802 + if (timeout < 0) 3803 + return timeout; 3808 3804 } 3809 3805 3810 3806 err = wait_for_engines(i915); ··· 3816 3812 } else { 3817 3813 struct intel_engine_cs *engine; 3818 3814 enum intel_engine_id id; 3819 - int err; 3820 3815 3821 3816 for_each_engine(engine, i915, id) { 3822 - err = wait_for_timeline(&engine->timeline, flags); 3823 - if (err) 3824 - return err; 3817 + struct i915_timeline *tl = &engine->timeline; 3818 + 3819 + timeout = wait_for_timeline(tl, flags, timeout); 3820 + if (timeout < 0) 3821 + return timeout; 3825 3822 } 3826 3823 } 3827 3824 ··· 5057 5052 ret = i915_gem_wait_for_idle(dev_priv, 5058 5053 I915_WAIT_INTERRUPTIBLE | 5059 5054 I915_WAIT_LOCKED | 5060 - I915_WAIT_FOR_IDLE_BOOST); 5055 + I915_WAIT_FOR_IDLE_BOOST, 5056 + MAX_SCHEDULE_TIMEOUT); 5061 5057 if (ret && ret != -EIO) 5062 5058 goto err_unlock; 5063 5059 ··· 5362 5356 if (err) 5363 5357 goto err_active; 5364 5358 5365 - err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); 5359 + err = i915_gem_wait_for_idle(i915, 5360 + I915_WAIT_LOCKED, 5361 + MAX_SCHEDULE_TIMEOUT); 5366 5362 if (err) 5367 5363 goto err_active; 5368 5364 ··· 5429 5421 if (WARN_ON(i915_gem_switch_to_kernel_context(i915))) 5430 5422 goto out_ctx; 5431 5423 5432 - if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED))) 5424 + if (WARN_ON(i915_gem_wait_for_idle(i915, 5425 + I915_WAIT_LOCKED, 5426 + MAX_SCHEDULE_TIMEOUT))) 5433 5427 goto out_ctx; 5434 5428 5435 5429 i915_gem_contexts_lost(i915);
+2 -1
drivers/gpu/drm/i915/i915_gem_evict.c
··· 69 69 70 70 err = i915_gem_wait_for_idle(i915, 71 71 I915_WAIT_INTERRUPTIBLE | 72 - I915_WAIT_LOCKED); 72 + I915_WAIT_LOCKED, 73 + MAX_SCHEDULE_TIMEOUT); 73 74 if (err) 74 75 return err; 75 76
+1 -1
drivers/gpu/drm/i915/i915_gem_gtt.c
··· 2793 2793 struct i915_ggtt *ggtt = &dev_priv->ggtt; 2794 2794 2795 2795 if (unlikely(ggtt->do_idle_maps)) { 2796 - if (i915_gem_wait_for_idle(dev_priv, 0)) { 2796 + if (i915_gem_wait_for_idle(dev_priv, 0, MAX_SCHEDULE_TIMEOUT)) { 2797 2797 DRM_ERROR("Failed to wait for idle; VT'd may hang.\n"); 2798 2798 /* Wait a bit, in hopes it avoids the hang */ 2799 2799 udelay(10);
+8 -3
drivers/gpu/drm/i915/i915_gem_shrinker.c
··· 172 172 * we will free as much as we can and hope to get a second chance. 173 173 */ 174 174 if (flags & I915_SHRINK_ACTIVE) 175 - i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); 175 + i915_gem_wait_for_idle(i915, 176 + I915_WAIT_LOCKED, 177 + MAX_SCHEDULE_TIMEOUT); 176 178 177 179 trace_i915_gem_shrink(i915, target, flags); 178 180 i915_retire_requests(i915); ··· 394 392 unsigned long timeout = jiffies + msecs_to_jiffies_timeout(timeout_ms); 395 393 396 394 do { 397 - if (i915_gem_wait_for_idle(i915, 0) == 0 && 395 + if (i915_gem_wait_for_idle(i915, 396 + 0, MAX_SCHEDULE_TIMEOUT) == 0 && 398 397 shrinker_lock(i915, unlock)) 399 398 break; 400 399 ··· 469 466 return NOTIFY_DONE; 470 467 471 468 /* Force everything onto the inactive lists */ 472 - ret = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); 469 + ret = i915_gem_wait_for_idle(i915, 470 + I915_WAIT_LOCKED, 471 + MAX_SCHEDULE_TIMEOUT); 473 472 if (ret) 474 473 goto out; 475 474
+3 -1
drivers/gpu/drm/i915/i915_perf.c
··· 1836 1836 * So far the best way to work around this issue seems to be draining 1837 1837 * the GPU from any submitted work. 1838 1838 */ 1839 - ret = i915_gem_wait_for_idle(dev_priv, wait_flags); 1839 + ret = i915_gem_wait_for_idle(dev_priv, 1840 + wait_flags, 1841 + MAX_SCHEDULE_TIMEOUT); 1840 1842 if (ret) 1841 1843 goto out; 1842 1844
+4 -2
drivers/gpu/drm/i915/i915_request.c
··· 206 206 /* Carefully retire all requests without writing to the rings */ 207 207 ret = i915_gem_wait_for_idle(i915, 208 208 I915_WAIT_INTERRUPTIBLE | 209 - I915_WAIT_LOCKED); 209 + I915_WAIT_LOCKED, 210 + MAX_SCHEDULE_TIMEOUT); 210 211 if (ret) 211 212 return ret; 212 213 ··· 736 735 /* Ratelimit ourselves to prevent oom from malicious clients */ 737 736 ret = i915_gem_wait_for_idle(i915, 738 737 I915_WAIT_LOCKED | 739 - I915_WAIT_INTERRUPTIBLE); 738 + I915_WAIT_INTERRUPTIBLE, 739 + MAX_SCHEDULE_TIMEOUT); 740 740 if (ret) 741 741 goto err_unreserve; 742 742
+3 -1
drivers/gpu/drm/i915/selftests/i915_gem_context.c
··· 478 478 } 479 479 } 480 480 481 - err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); 481 + err = i915_gem_wait_for_idle(i915, 482 + I915_WAIT_LOCKED, 483 + MAX_SCHEDULE_TIMEOUT); 482 484 if (err) 483 485 return err; 484 486
+3 -1
drivers/gpu/drm/i915/selftests/i915_request.c
··· 286 286 t->func = func; 287 287 t->name = name; 288 288 289 - err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED); 289 + err = i915_gem_wait_for_idle(i915, 290 + I915_WAIT_LOCKED, 291 + MAX_SCHEDULE_TIMEOUT); 290 292 if (err) { 291 293 pr_err("%s(%s): failed to idle before, with err=%d!", 292 294 func, name, err);
+1 -1
drivers/gpu/drm/i915/selftests/igt_flush_test.c
··· 64 64 } 65 65 66 66 wedge_on_timeout(&w, i915, HZ) 67 - i915_gem_wait_for_idle(i915, flags); 67 + i915_gem_wait_for_idle(i915, flags, MAX_SCHEDULE_TIMEOUT); 68 68 69 69 return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0; 70 70 }