Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/i915: Serialise i915_active_fence_set() with itself

The expected downside to commit 58b4c1a07ada ("drm/i915: Reduce nested
prepare_remote_context() to a trylock") was that it would need to return
-EAGAIN to userspace in order to resolve potential mutex inversion. Such
an unsightly round trip is unnecessary if we could atomically insert a
barrier into the i915_active_fence, so make it happen.

Currently, we use the timeline->mutex (or some other named outer lock)
to order insertion into the i915_active_fence (and so individual nodes
of i915_active). Inside __i915_active_fence_set, we then only need to
serialise with the interrupt handler in order to claim the timeline for
ourselves.

However, if we remove the outer lock, we need to ensure the order is
intact between not only multiple threads trying to insert themselves
into the timeline, but also with the interrupt handler completing the
previous occupant. We use xchg() on insert so that we have an ordered
sequence of insertions (and each caller knows the previous fence on
which to wait, preserving the chain of all fences in the timeline), but
we then have to cmpxchg() in the interrupt handler to avoid overwriting
the new occupant. The only nasty side-effect is having to temporarily
strip off the RCU-annotations to apply the atomic operations, otherwise
the rules are much more conventional!

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=112402
Fixes: 58b4c1a07ada ("drm/i915: Reduce nested prepare_remote_context() to a trylock")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191127134527.3438410-1-chris@chris-wilson.co.uk

+62 -102
-19
drivers/gpu/drm/i915/gt/intel_context.c
··· 310 310 GEM_BUG_ON(rq->hw_context == ce); 311 311 312 312 if (rcu_access_pointer(rq->timeline) != tl) { /* timeline sharing! */ 313 - /* 314 - * Ideally, we just want to insert our foreign fence as 315 - * a barrier into the remove context, such that this operation 316 - * occurs after all current operations in that context, and 317 - * all future operations must occur after this. 318 - * 319 - * Currently, the timeline->last_request tracking is guarded 320 - * by its mutex and so we must obtain that to atomically 321 - * insert our barrier. However, since we already hold our 322 - * timeline->mutex, we must be careful against potential 323 - * inversion if we are the kernel_context as the remote context 324 - * will itself poke at the kernel_context when it needs to 325 - * unpin. Ergo, if already locked, we drop both locks and 326 - * try again (through the magic of userspace repeating EAGAIN). 327 - */ 328 - if (!mutex_trylock(&tl->mutex)) 329 - return -EAGAIN; 330 - 331 313 /* Queue this switch after current activity by this context. */ 332 314 err = i915_active_fence_set(&tl->last_request, rq); 333 - mutex_unlock(&tl->mutex); 334 315 if (err) 335 316 return err; 336 317 }
+1 -1
drivers/gpu/drm/i915/gt/intel_engine_pm.c
··· 183 183 container_of((struct list_head *)node, 184 184 typeof(*cb), node); 185 185 186 - cb->func(NULL, cb); 186 + cb->func(ERR_PTR(-EAGAIN), cb); 187 187 } 188 188 } 189 189
+1 -1
drivers/gpu/drm/i915/gt/intel_timeline.c
··· 254 254 255 255 mutex_init(&timeline->mutex); 256 256 257 - INIT_ACTIVE_FENCE(&timeline->last_request, &timeline->mutex); 257 + INIT_ACTIVE_FENCE(&timeline->last_request); 258 258 INIT_LIST_HEAD(&timeline->requests); 259 259 260 260 i915_syncmap_init(&timeline->sync);
+1 -1
drivers/gpu/drm/i915/gt/selftests/mock_timeline.c
··· 15 15 16 16 mutex_init(&timeline->mutex); 17 17 18 - INIT_ACTIVE_FENCE(&timeline->last_request, &timeline->mutex); 18 + INIT_ACTIVE_FENCE(&timeline->last_request); 19 19 INIT_LIST_HEAD(&timeline->requests); 20 20 21 21 i915_syncmap_init(&timeline->sync);
+57 -50
drivers/gpu/drm/i915/i915_active.c
··· 186 186 __active_retire(ref); 187 187 } 188 188 189 + static inline struct dma_fence ** 190 + __active_fence_slot(struct i915_active_fence *active) 191 + { 192 + return (struct dma_fence ** __force)&active->fence; 193 + } 194 + 195 + static inline bool 196 + active_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb) 197 + { 198 + struct i915_active_fence *active = 199 + container_of(cb, typeof(*active), cb); 200 + 201 + return cmpxchg(__active_fence_slot(active), fence, NULL) == fence; 202 + } 203 + 189 204 static void 190 205 node_retire(struct dma_fence *fence, struct dma_fence_cb *cb) 191 206 { 192 - i915_active_fence_cb(fence, cb); 193 - active_retire(container_of(cb, struct active_node, base.cb)->ref); 207 + if (active_fence_cb(fence, cb)) 208 + active_retire(container_of(cb, struct active_node, base.cb)->ref); 194 209 } 195 210 196 211 static void 197 212 excl_retire(struct dma_fence *fence, struct dma_fence_cb *cb) 198 213 { 199 - i915_active_fence_cb(fence, cb); 200 - active_retire(container_of(cb, struct i915_active, excl.cb)); 214 + if (active_fence_cb(fence, cb)) 215 + active_retire(container_of(cb, struct i915_active, excl.cb)); 201 216 } 202 217 203 218 static struct i915_active_fence * ··· 259 244 } 260 245 261 246 node = prealloc; 262 - __i915_active_fence_init(&node->base, &tl->mutex, NULL, node_retire); 247 + __i915_active_fence_init(&node->base, NULL, node_retire); 263 248 node->ref = ref; 264 249 node->timeline = idx; 265 250 ··· 296 281 init_llist_head(&ref->preallocated_barriers); 297 282 atomic_set(&ref->count, 0); 298 283 __mutex_init(&ref->mutex, "i915_active", key); 299 - __i915_active_fence_init(&ref->excl, &ref->mutex, NULL, excl_retire); 284 + __i915_active_fence_init(&ref->excl, NULL, excl_retire); 300 285 INIT_WORK(&ref->work, active_work); 301 286 } 302 287 ··· 391 376 /* We expect the caller to manage the exclusive timeline ordering */ 392 377 GEM_BUG_ON(i915_active_is_idle(ref)); 393 378 394 - /* 395 - * As we don't know 
which mutex the caller is using, we told a small 396 - * lie to the debug code that it is using the i915_active.mutex; 397 - * and now we must stick to that lie. 398 - */ 399 - mutex_acquire(&ref->mutex.dep_map, 0, 0, _THIS_IP_); 400 379 if (!__i915_active_fence_set(&ref->excl, f)) 401 380 atomic_inc(&ref->count); 402 - mutex_release(&ref->mutex.dep_map, 0, _THIS_IP_); 403 381 } 404 382 405 383 bool i915_active_acquire_if_busy(struct i915_active *ref) ··· 623 615 goto unwind; 624 616 } 625 617 626 - #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) 627 - node->base.lock = 628 - &engine->kernel_context->timeline->mutex; 629 - #endif 630 618 RCU_INIT_POINTER(node->base.fence, NULL); 631 619 node->base.cb.func = node_retire; 632 620 node->timeline = idx; ··· 643 639 node->base.cb.node.prev = (void *)engine; 644 640 atomic_inc(&ref->count); 645 641 } 642 + GEM_BUG_ON(rcu_access_pointer(node->base.fence) != ERR_PTR(-EAGAIN)); 646 643 647 644 GEM_BUG_ON(barrier_to_engine(node) != engine); 648 645 llist_add(barrier_to_ll(node), &ref->preallocated_barriers); ··· 707 702 } 708 703 } 709 704 705 + static struct dma_fence **ll_to_fence_slot(struct llist_node *node) 706 + { 707 + return __active_fence_slot(&barrier_from_ll(node)->base); 708 + } 709 + 710 710 void i915_request_add_active_barriers(struct i915_request *rq) 711 711 { 712 712 struct intel_engine_cs *engine = rq->engine; ··· 731 721 */ 732 722 spin_lock_irqsave(&rq->lock, flags); 733 723 llist_for_each_safe(node, next, node) { 734 - RCU_INIT_POINTER(barrier_from_ll(node)->base.fence, &rq->fence); 735 - smp_wmb(); /* serialise with reuse_idle_barrier */ 724 + /* serialise with reuse_idle_barrier */ 725 + smp_store_mb(*ll_to_fence_slot(node), &rq->fence); 736 726 list_add_tail((struct list_head *)node, &rq->fence.cb_list); 737 727 } 738 728 spin_unlock_irqrestore(&rq->lock, flags); 739 729 } 740 - 741 - #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) 742 - #define active_is_held(active) lockdep_is_held((active)->lock) 743 - #else 
744 - #define active_is_held(active) true 745 - #endif 746 730 747 731 /* 748 732 * __i915_active_fence_set: Update the last active fence along its timeline ··· 748 744 * fence onto this one. Returns the previous fence (if not already completed), 749 745 * which the caller must ensure is executed before the new fence. To ensure 750 746 * that the order of fences within the timeline of the i915_active_fence is 751 - * maintained, it must be locked by the caller. 747 + * understood, it should be locked by the caller. 752 748 */ 753 749 struct dma_fence * 754 750 __i915_active_fence_set(struct i915_active_fence *active, ··· 757 753 struct dma_fence *prev; 758 754 unsigned long flags; 759 755 760 - /* NB: must be serialised by an outer timeline mutex (active->lock) */ 761 - spin_lock_irqsave(fence->lock, flags); 756 + if (fence == rcu_access_pointer(active->fence)) 757 + return fence; 758 + 762 759 GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)); 763 760 764 - prev = rcu_dereference_protected(active->fence, active_is_held(active)); 761 + /* 762 + * Consider that we have two threads arriving (A and B), with 763 + * C already resident as the active->fence. 764 + * 765 + * A does the xchg first, and so it sees C or NULL depending 766 + * on the timing of the interrupt handler. If it is NULL, the 767 + * previous fence must have been signaled and we know that 768 + * we are first on the timeline. If it is still present, 769 + * we acquire the lock on that fence and serialise with the interrupt 770 + * handler, in the process removing it from any future interrupt 771 + * callback. A will then wait on C before executing (if present). 772 + * 773 + * As B is second, it sees A as the previous fence and so waits for 774 + * it to complete its transition and takes over the occupancy for 775 + * itself -- remembering that it needs to wait on A before executing. 
776 + * 777 + * Note the strong ordering of the timeline also provides consistent 778 + * nesting rules for the fence->lock; the inner lock is always the 779 + * older lock. 780 + */ 781 + spin_lock_irqsave(fence->lock, flags); 782 + prev = xchg(__active_fence_slot(active), fence); 765 783 if (prev) { 766 784 GEM_BUG_ON(prev == fence); 767 785 spin_lock_nested(prev->lock, SINGLE_DEPTH_NESTING); 768 786 __list_del_entry(&active->cb.node); 769 787 spin_unlock(prev->lock); /* serialise with prev->cb_list */ 770 - 771 - /* 772 - * active->fence is reset by the callback from inside 773 - * interrupt context. We need to serialise our list 774 - * manipulation with the fence->lock to prevent the prev 775 - * being lost inside an interrupt (it can't be replaced as 776 - * no other caller is allowed to enter __i915_active_fence_set 777 - * as we hold the timeline lock). After serialising with 778 - * the callback, we need to double check which ran first, 779 - * our list_del() [decoupling prev from the callback] or 780 - * the callback... 781 - */ 782 - prev = rcu_access_pointer(active->fence); 783 788 } 784 - 785 - rcu_assign_pointer(active->fence, fence); 789 + GEM_BUG_ON(rcu_access_pointer(active->fence) != fence); 786 790 list_add_tail(&active->cb.node, &fence->cb_list); 787 - 788 791 spin_unlock_irqrestore(fence->lock, flags); 789 792 790 793 return prev; ··· 802 791 { 803 792 struct dma_fence *fence; 804 793 int err = 0; 805 - 806 - #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) 807 - lockdep_assert_held(active->lock); 808 - #endif 809 794 810 795 /* Must maintain timeline ordering wrt previous active requests */ 811 796 rcu_read_lock(); ··· 819 812 820 813 void i915_active_noop(struct dma_fence *fence, struct dma_fence_cb *cb) 821 814 { 822 - i915_active_fence_cb(fence, cb); 815 + active_fence_cb(fence, cb); 823 816 } 824 817 825 818 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+2 -15
drivers/gpu/drm/i915/i915_active.h
··· 61 61 */ 62 62 static inline void 63 63 __i915_active_fence_init(struct i915_active_fence *active, 64 - struct mutex *lock, 65 64 void *fence, 66 65 dma_fence_func_t fn) 67 66 { 68 67 RCU_INIT_POINTER(active->fence, fence); 69 68 active->cb.func = fn ?: i915_active_noop; 70 - #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) 71 - active->lock = lock; 72 - #endif 73 69 } 74 70 75 - #define INIT_ACTIVE_FENCE(A, LOCK) \ 76 - __i915_active_fence_init((A), (LOCK), NULL, NULL) 71 + #define INIT_ACTIVE_FENCE(A) \ 72 + __i915_active_fence_init((A), NULL, NULL) 77 73 78 74 struct dma_fence * 79 75 __i915_active_fence_set(struct i915_active_fence *active, ··· 121 125 i915_active_fence_isset(const struct i915_active_fence *active) 122 126 { 123 127 return rcu_access_pointer(active->fence); 124 - } 125 - 126 - static inline void 127 - i915_active_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb) 128 - { 129 - struct i915_active_fence *active = 130 - container_of(cb, typeof(*active), cb); 131 - 132 - RCU_INIT_POINTER(active->fence, NULL); 133 128 } 134 129 135 130 /*
-15
drivers/gpu/drm/i915/i915_active_types.h
··· 20 20 struct i915_active_fence { 21 21 struct dma_fence __rcu *fence; 22 22 struct dma_fence_cb cb; 23 - #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) 24 - /* 25 - * Incorporeal! 26 - * 27 - * Updates to the i915_active_request must be serialised under a lock 28 - * to ensure that the timeline is ordered. Normally, this is the 29 - * timeline->mutex, but another mutex may be used so long as it is 30 - * done so consistently. 31 - * 32 - * For lockdep tracking of the above, we store the lock we intend 33 - * to always use for updates of this i915_active_request during 34 - * construction and assert that is held on every update. 35 - */ 36 - struct mutex *lock; 37 - #endif 38 23 }; 39 24 40 25 struct active_node;