
drm/i915/gem: Don't leak non-persistent requests on changing engines

If we have a set of active engines marked as being non-persistent, we
lose track of them when the user replaces the set with
I915_CONTEXT_PARAM_ENGINES. Part of our uABI contract is that
non-persistent requests are terminated once they are no longer tracked
by the user's context (to prevent a lost request causing an untracked,
and so unstoppable, GPU hang), so we must apply the same cancellation
when the engines are replaced.
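
For reference, the userspace-visible sequence being fixed looks roughly
like this (a minimal sketch: fd is an assumed open i915 DRM fd, return
values are unchecked, the batch submission is elided, and the helper
name is invented for illustration):

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static void replace_engines_nonpersistent(int fd)
{
	struct drm_i915_gem_context_create create = {};

	ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);

	/* Opt out of persistence: requests must not outlive tracking. */
	struct drm_i915_gem_context_param persist = {
		.ctx_id = create.ctx_id,
		.param = I915_CONTEXT_PARAM_PERSISTENCE,
		.value = 0,
	};
	ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &persist);

	/* ... submit a long-running batch on the current engines ... */

	/* Replace the engine map while the old engines are still busy;
	 * the kernel must now cancel their outstanding requests instead
	 * of letting them run untracked (and so unstoppable). */
	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1) = {
		.engines = { { I915_ENGINE_CLASS_RENDER, 0 } },
	};
	struct drm_i915_gem_context_param set = {
		.ctx_id = create.ctx_id,
		.param = I915_CONTEXT_PARAM_ENGINES,
		.size = sizeof(engines),
		.value = (uintptr_t)&engines,
	};
	ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &set);
}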

v2: Track stale engines[] so we only reap at context closure.
v3: Tvrtko spotted races between closing contexts and set-engines, so
add a veneer of kill-everything paranoia to clean up after losing a
race.

Fixes: a0e047156cde ("drm/i915/gem: Make context persistence optional")
Testcase: igt/gem_ctx_persistence/replace
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200211144831.1011498-1-chris@chris-wilson.co.uk

4 files changed, 141 insertions(+), 13 deletions(-)
+114 -8
drivers/gpu/drm/i915/gem/i915_gem_context.c
···
 	if (!e)
 		return ERR_PTR(-ENOMEM);
 
-	init_rcu_head(&e->rcu);
+	e->ctx = ctx;
+
 	for_each_engine(engine, gt, id) {
 		struct intel_context *ce;
 
···
 	return engine;
 }
 
-static void kill_context(struct i915_gem_context *ctx)
+static void kill_engines(struct i915_gem_engines *engines)
 {
 	struct i915_gem_engines_iter it;
 	struct intel_context *ce;
···
 	 * However, we only care about pending requests, so only include
 	 * engines on which there are incomplete requests.
 	 */
-	for_each_gem_engine(ce, __context_engines_static(ctx), it) {
+	for_each_gem_engine(ce, engines, it) {
 		struct intel_engine_cs *engine;
 
 		if (intel_context_set_banned(ce))
···
 		 * the context from the GPU, we have to resort to a full
 		 * reset. We hope the collateral damage is worth it.
 		 */
-		__reset_context(ctx, engine);
+		__reset_context(engines->ctx, engine);
 	}
+}
+
+static void kill_stale_engines(struct i915_gem_context *ctx)
+{
+	struct i915_gem_engines *pos, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->stale.lock, flags);
+	list_for_each_entry_safe(pos, next, &ctx->stale.engines, link) {
+		if (!i915_sw_fence_await(&pos->fence))
+			continue;
+
+		spin_unlock_irqrestore(&ctx->stale.lock, flags);
+
+		kill_engines(pos);
+
+		spin_lock_irqsave(&ctx->stale.lock, flags);
+		list_safe_reset_next(pos, next, link);
+		list_del_init(&pos->link); /* decouple from FENCE_COMPLETE */
+
+		i915_sw_fence_complete(&pos->fence);
+	}
+	spin_unlock_irqrestore(&ctx->stale.lock, flags);
+}
+
+static void kill_context(struct i915_gem_context *ctx)
+{
+	kill_stale_engines(ctx);
+	kill_engines(__context_engines_static(ctx));
 }
 
 static void set_closed_name(struct i915_gem_context *ctx)
···
 	ctx->i915 = i915;
 	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
 	mutex_init(&ctx->mutex);
+
+	spin_lock_init(&ctx->stale.lock);
+	INIT_LIST_HEAD(&ctx->stale.engines);
 
 	mutex_init(&ctx->engines_mutex);
 	e = default_engines(ctx);
···
 	[I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond,
 };
 
+static int engines_notify(struct i915_sw_fence *fence,
+			  enum i915_sw_fence_notify state)
+{
+	struct i915_gem_engines *engines =
+		container_of(fence, typeof(*engines), fence);
+
+	switch (state) {
+	case FENCE_COMPLETE:
+		if (!list_empty(&engines->link)) {
+			struct i915_gem_context *ctx = engines->ctx;
+			unsigned long flags;
+
+			spin_lock_irqsave(&ctx->stale.lock, flags);
+			list_del(&engines->link);
+			spin_unlock_irqrestore(&ctx->stale.lock, flags);
+		}
+		break;
+
+	case FENCE_FREE:
+		init_rcu_head(&engines->rcu);
+		call_rcu(&engines->rcu, free_engines_rcu);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void engines_idle_release(struct i915_gem_engines *engines)
+{
+	struct i915_gem_engines_iter it;
+	struct intel_context *ce;
+	unsigned long flags;
+
+	GEM_BUG_ON(!engines);
+	i915_sw_fence_init(&engines->fence, engines_notify);
+
+	INIT_LIST_HEAD(&engines->link);
+	spin_lock_irqsave(&engines->ctx->stale.lock, flags);
+	if (!i915_gem_context_is_closed(engines->ctx))
+		list_add(&engines->link, &engines->ctx->stale.engines);
+	spin_unlock_irqrestore(&engines->ctx->stale.lock, flags);
+	if (list_empty(&engines->link)) /* raced, already closed */
+		goto kill;
+
+	for_each_gem_engine(ce, engines, it) {
+		struct dma_fence *fence;
+		int err;
+
+		if (!ce->timeline)
+			continue;
+
+		fence = i915_active_fence_get(&ce->timeline->last_request);
+		if (!fence)
+			continue;
+
+		err = i915_sw_fence_await_dma_fence(&engines->fence,
+						    fence, 0,
+						    GFP_KERNEL);
+
+		dma_fence_put(fence);
+		if (err < 0)
+			goto kill;
+	}
+	goto out;
+
+kill:
+	kill_engines(engines);
+out:
+	i915_sw_fence_commit(&engines->fence);
+}
+
 static int
 set_engines(struct i915_gem_context *ctx,
 	    const struct drm_i915_gem_context_param *args)
···
 	if (!set.engines)
 		return -ENOMEM;
 
-	init_rcu_head(&set.engines->rcu);
+	set.engines->ctx = ctx;
+
 	for (n = 0; n < num_engines; n++) {
 		struct i915_engine_class_instance ci;
 		struct intel_engine_cs *engine;
···
 	set.engines = rcu_replace_pointer(ctx->engines, set.engines, 1);
 	mutex_unlock(&ctx->engines_mutex);
 
-	call_rcu(&set.engines->rcu, free_engines_rcu);
+	/* Keep track of old engine sets for kill_context() */
+	engines_idle_release(set.engines);
 
 	return 0;
 }
···
 	if (!copy)
 		return ERR_PTR(-ENOMEM);
 
-	init_rcu_head(&copy->rcu);
 	for (n = 0; n < e->num_engines; n++) {
 		if (e->engines[n])
 			copy->engines[n] = intel_context_get(e->engines[n]);
···
 	if (!clone)
 		goto err_unlock;
 
-	init_rcu_head(&clone->rcu);
+	clone->ctx = dst;
+
 	for (n = 0; n < e->num_engines; n++) {
 		struct intel_engine_cs *engine;
 
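
The release path above reads most easily as a counter state machine:
the sw-fence starts with one pending reference from i915_sw_fence_init(),
engines_idle_release() adds one per still-active engine timeline, and
i915_sw_fence_commit() drops the initial reference. When the count
reaches zero, engines_notify() runs FENCE_COMPLETE (unlink from
ctx->stale.engines) followed by FENCE_FREE (free via RCU). A stand-alone
toy model of that lifecycle (illustrative names, no locking, not the
i915 API):

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model of the engines_idle_release()/engines_notify() lifecycle;
 * names are illustrative, not the i915 API.
 */
struct stale_model {
	int pending;		/* 1 from init + 1 per tracked request */
	bool on_stale_list;	/* modelled ctx->stale.engines membership */
};

static void model_complete(struct stale_model *s)
{
	if (--s->pending)
		return;

	/* FENCE_COMPLETE: unlink from the context's stale list... */
	s->on_stale_list = false;
	/* FENCE_FREE: ...then reuse the storage as an rcu head and free. */
	printf("call_rcu(free_engines_rcu)\n");
}

int main(void)
{
	struct stale_model s = { .pending = 1, .on_stale_list = true };

	s.pending++;		/* await one engine's last request */
	model_complete(&s);	/* that request retires: 2 -> 1 */
	model_complete(&s);	/* commit drops the init ref: 1 -> 0 */
	return 0;
}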
+12 -1
drivers/gpu/drm/i915/gem/i915_gem_context_types.h
···
 #include "gt/intel_context_types.h"
 
 #include "i915_scheduler.h"
+#include "i915_sw_fence.h"
 
 struct pid;
 
···
 struct intel_ring;
 
 struct i915_gem_engines {
-	struct rcu_head rcu;
+	union {
+		struct list_head link;
+		struct rcu_head rcu;
+	};
+	struct i915_sw_fence fence;
+	struct i915_gem_context *ctx;
 	unsigned int num_engines;
 	struct intel_context *engines[];
 };
···
 	 * context in messages.
 	 */
 	char name[TASK_COMM_LEN + 8];
+
+	struct {
+		struct spinlock lock;
+		struct list_head engines;
+	} stale;
 };
 
 #endif /* __I915_GEM_CONTEXT_TYPES_H__ */
+14 -3
drivers/gpu/drm/i915/i915_sw_fence.c
···
 	__i915_sw_fence_complete(fence, NULL);
 }
 
-void i915_sw_fence_await(struct i915_sw_fence *fence)
+bool i915_sw_fence_await(struct i915_sw_fence *fence)
 {
-	debug_fence_assert(fence);
-	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
+	int pending;
+
+	/*
+	 * It is only safe to add a new await to the fence while it has
+	 * not yet been signaled (i.e. there are still existing signalers).
+	 */
+	pending = atomic_read(&fence->pending);
+	do {
+		if (pending < 1)
+			return false;
+	} while (!atomic_try_cmpxchg(&fence->pending, &pending, pending + 1));
+
+	return true;
 }
 
 void __i915_sw_fence_init(struct i915_sw_fence *fence,
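
The rewritten i915_sw_fence_await() is the familiar "get unless zero"
idiom (compare kref_get_unless_zero()): once pending has dropped to
zero the fence has signaled and its FENCE_FREE notification may already
have released the engines, so a new await must be refused rather than
resurrect the fence. A stand-alone C11 rendition of the same loop (a
sketch, not the kernel's atomic API):

#include <stdatomic.h>
#include <stdbool.h>

/* Increment 'pending' only while it is still positive; refuse once it
 * has reached zero, exactly as i915_sw_fence_await() does above. */
static bool get_unless_zero(atomic_int *pending)
{
	int old = atomic_load(pending);

	do {
		if (old < 1)
			return false;	/* already signaled: too late */
	} while (!atomic_compare_exchange_weak(pending, &old, old + 1));

	return true;
}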
+1 -1
drivers/gpu/drm/i915/i915_sw_fence.h
···
 				  unsigned long timeout,
 				  gfp_t gfp);
 
-void i915_sw_fence_await(struct i915_sw_fence *fence);
+bool i915_sw_fence_await(struct i915_sw_fence *fence);
 void i915_sw_fence_complete(struct i915_sw_fence *fence);
 
 static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)