Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/i915/ringbuffer: Fix context restore upon reset

The discovery with trying to enable full-ppgtt was that we were
completely failing to load both the mm and context following the
reset. Although we were performing mmio to set the PP_DIR (per-process
GTT) and CCID (context), these were taking no effect (the assumption was
that this would trigger reload of the context and restore the page
tables). It was not until we performed the LRI + MI_SET_CONTEXT in a
following context switch that anything would occur.

Since we are then required to reset the context image and PP_DIR using
CS commands, we place those commands into every batch. The hardware
should recognise the no-ops and eliminate the expensive context loads,
but we still have to pay the cost of using cross-powerwell register
writes. In practice, this has no effect on actual context switch times,
and only adds a few hundred nanoseconds to no-op switches. We can improve
the latter by eliminating the w/a around known no-op switches, but there
is an ulterior motive to keeping them.

Always emitting the context switch at the beginning of the request (and
relying on HW to skip unneeded switches) does have one key advantage.
Should we implement request reordering on Haswell, we will not know in
advance what the previous executing context was on the GPU and so we
would not be able to elide the MI_SET_CONTEXT commands ourselves and
always have to emit them. Having our hand forced now actually prepares
us for later.

Now since that context and mm follow the request, we no longer (and not
for a long time since requests took over!) require a trace point to tell
when we write the switch into the ring, since it is always emitted. (This is
even more important when you remember that simply writing into the ring
bears no relation to the current mm.)

v2: Sandybridge has to agree to use LRI as well.

Testcase: igt/drv_selftests/live_hangcheck
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Matthew Auld <matthew.william.auld@gmail.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180611110845.31890-1-chris@chris-wilson.co.uk

+66 -156
-45
drivers/gpu/drm/i915/i915_gem_gtt.c
··· 1712 1712 wmb(); 1713 1713 } 1714 1714 1715 - static inline u32 get_pd_offset(struct i915_hw_ppgtt *ppgtt) 1716 - { 1717 - GEM_BUG_ON(ppgtt->pd.base.ggtt_offset & 0x3f); 1718 - return ppgtt->pd.base.ggtt_offset << 10; 1719 - } 1720 - 1721 - static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt, 1722 - struct i915_request *rq) 1723 - { 1724 - struct intel_engine_cs *engine = rq->engine; 1725 - u32 *cs; 1726 - 1727 - /* NB: TLBs must be flushed and invalidated before a switch */ 1728 - cs = intel_ring_begin(rq, 6); 1729 - if (IS_ERR(cs)) 1730 - return PTR_ERR(cs); 1731 - 1732 - *cs++ = MI_LOAD_REGISTER_IMM(2); 1733 - *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine)); 1734 - *cs++ = PP_DIR_DCLV_2G; 1735 - *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine)); 1736 - *cs++ = get_pd_offset(ppgtt); 1737 - *cs++ = MI_NOOP; 1738 - intel_ring_advance(rq, cs); 1739 - 1740 - return 0; 1741 - } 1742 - 1743 - static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt, 1744 - struct i915_request *rq) 1745 - { 1746 - struct intel_engine_cs *engine = rq->engine; 1747 - struct drm_i915_private *dev_priv = rq->i915; 1748 - 1749 - I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G); 1750 - I915_WRITE(RING_PP_DIR_BASE(engine), get_pd_offset(ppgtt)); 1751 - return 0; 1752 - } 1753 - 1754 1715 static void gen8_ppgtt_enable(struct drm_i915_private *dev_priv) 1755 1716 { 1756 1717 struct intel_engine_cs *engine; ··· 1985 2024 ppgtt->vm.dma = &i915->drm.pdev->dev; 1986 2025 1987 2026 ppgtt->vm.pte_encode = ggtt->vm.pte_encode; 1988 - if (IS_GEN6(i915)) 1989 - ppgtt->switch_mm = gen6_mm_switch; 1990 - else if (IS_GEN7(i915)) 1991 - ppgtt->switch_mm = gen7_mm_switch; 1992 - else 1993 - BUG(); 1994 2027 1995 2028 err = gen6_ppgtt_alloc(ppgtt); 1996 2029 if (err)
-2
drivers/gpu/drm/i915/i915_gem_gtt.h
··· 406 406 407 407 gen6_pte_t __iomem *pd_addr; 408 408 409 - int (*switch_mm)(struct i915_hw_ppgtt *ppgtt, 410 - struct i915_request *rq); 411 409 void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file *m); 412 410 }; 413 411
+2
drivers/gpu/drm/i915/i915_request.c
··· 817 817 /* Keep a second pin for the dual retirement along engine and ring */ 818 818 __intel_context_pin(ce); 819 819 820 + rq->infix = rq->ring->emit; /* end of header; start of user payload */ 821 + 820 822 /* Check that we didn't interrupt ourselves with a new request */ 821 823 GEM_BUG_ON(rq->timeline->seqno != rq->fence.seqno); 822 824 return rq;
+3
drivers/gpu/drm/i915/i915_request.h
··· 134 134 /** Position in the ring of the start of the request */ 135 135 u32 head; 136 136 137 + /** Position in the ring of the start of the user packets */ 138 + u32 infix; 139 + 137 140 /** 138 141 * Position in the ring of the start of the postfix. 139 142 * This is required to calculate the maximum available ring space
-33
drivers/gpu/drm/i915/i915_trace.h
··· 973 973 TP_ARGS(ctx) 974 974 ); 975 975 976 - /** 977 - * DOC: switch_mm tracepoint 978 - * 979 - * This tracepoint allows tracking of the mm switch, which is an important point 980 - * in the lifetime of the vm in the legacy submission path. This tracepoint is 981 - * called only if full ppgtt is enabled. 982 - */ 983 - TRACE_EVENT(switch_mm, 984 - TP_PROTO(struct intel_engine_cs *engine, struct i915_gem_context *to), 985 - 986 - TP_ARGS(engine, to), 987 - 988 - TP_STRUCT__entry( 989 - __field(u16, class) 990 - __field(u16, instance) 991 - __field(struct i915_gem_context *, to) 992 - __field(struct i915_address_space *, vm) 993 - __field(u32, dev) 994 - ), 995 - 996 - TP_fast_assign( 997 - __entry->class = engine->uabi_class; 998 - __entry->instance = engine->instance; 999 - __entry->to = to; 1000 - __entry->vm = to->ppgtt ? &to->ppgtt->vm : NULL; 1001 - __entry->dev = engine->i915->drm.primary->index; 1002 - ), 1003 - 1004 - TP_printk("dev=%u, engine=%u:%u, ctx=%p, ctx_vm=%p", 1005 - __entry->dev, __entry->class, __entry->instance, __entry->to, 1006 - __entry->vm) 1007 - ); 1008 - 1009 976 #endif /* _I915_TRACE_H_ */ 1010 977 1011 978 /* This part must be outside protection */
-3
drivers/gpu/drm/i915/intel_engine_cs.c
··· 1168 1168 1169 1169 lockdep_assert_held(&engine->i915->drm.struct_mutex); 1170 1170 1171 - engine->legacy_active_context = NULL; 1172 - engine->legacy_active_ppgtt = NULL; 1173 - 1174 1171 ce = fetch_and_zero(&engine->last_retired_context); 1175 1172 if (ce) 1176 1173 intel_context_unpin(ce);
+61 -64
drivers/gpu/drm/i915/intel_ringbuffer.c
··· 541 541 return i915_gem_find_active_request(engine); 542 542 } 543 543 544 - static void reset_ring(struct intel_engine_cs *engine, 545 - struct i915_request *request) 544 + static void skip_request(struct i915_request *rq) 546 545 { 547 - GEM_TRACE("%s seqno=%x\n", 548 - engine->name, request ? request->global_seqno : 0); 546 + void *vaddr = rq->ring->vaddr; 547 + u32 head; 548 + 549 + head = rq->infix; 550 + if (rq->postfix < head) { 551 + memset32(vaddr + head, MI_NOOP, 552 + (rq->ring->size - head) / sizeof(u32)); 553 + head = 0; 554 + } 555 + memset32(vaddr + head, MI_NOOP, (rq->postfix - head) / sizeof(u32)); 556 + } 557 + 558 + static void reset_ring(struct intel_engine_cs *engine, struct i915_request *rq) 559 + { 560 + GEM_TRACE("%s seqno=%x\n", engine->name, rq ? rq->global_seqno : 0); 549 561 550 562 /* 551 563 * RC6 must be prevented until the reset is complete and the engine ··· 581 569 * If the request was innocent, we try to replay the request with 582 570 * the restored context. 583 571 */ 584 - if (request) { 585 - struct drm_i915_private *dev_priv = request->i915; 586 - struct intel_context *ce = request->hw_context; 587 - struct i915_hw_ppgtt *ppgtt; 588 - 589 - if (ce->state) { 590 - I915_WRITE(CCID, 591 - i915_ggtt_offset(ce->state) | 592 - BIT(8) /* must be set! 
*/ | 593 - CCID_EXTENDED_STATE_SAVE | 594 - CCID_EXTENDED_STATE_RESTORE | 595 - CCID_EN); 596 - } 597 - 598 - ppgtt = request->gem_context->ppgtt ?: engine->i915->mm.aliasing_ppgtt; 599 - if (ppgtt) { 600 - u32 pd_offset = ppgtt->pd.base.ggtt_offset << 10; 601 - 602 - I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G); 603 - I915_WRITE(RING_PP_DIR_BASE(engine), pd_offset); 604 - 605 - /* Wait for the PD reload to complete */ 606 - if (intel_wait_for_register(dev_priv, 607 - RING_PP_DIR_BASE(engine), 608 - BIT(0), 0, 609 - 10)) 610 - DRM_ERROR("Wait for reload of ppgtt page-directory timed out\n"); 611 - 612 - ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine); 613 - } 614 - 572 + if (rq) { 615 573 /* If the rq hung, jump to its breadcrumb and skip the batch */ 616 - if (request->fence.error == -EIO) 617 - request->ring->head = request->postfix; 618 - } else { 619 - engine->legacy_active_context = NULL; 620 - engine->legacy_active_ppgtt = NULL; 574 + rq->ring->head = intel_ring_wrap(rq->ring, rq->head); 575 + if (rq->fence.error == -EIO) 576 + skip_request(rq); 621 577 } 622 578 } 623 579 ··· 1426 1446 intel_ring_reset(engine->buffer, 0); 1427 1447 } 1428 1448 1449 + static int load_pd_dir(struct i915_request *rq, 1450 + const struct i915_hw_ppgtt *ppgtt) 1451 + { 1452 + const struct intel_engine_cs * const engine = rq->engine; 1453 + u32 *cs; 1454 + 1455 + cs = intel_ring_begin(rq, 6); 1456 + if (IS_ERR(cs)) 1457 + return PTR_ERR(cs); 1458 + 1459 + *cs++ = MI_LOAD_REGISTER_IMM(1); 1460 + *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine)); 1461 + *cs++ = PP_DIR_DCLV_2G; 1462 + 1463 + *cs++ = MI_LOAD_REGISTER_IMM(1); 1464 + *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine)); 1465 + *cs++ = ppgtt->pd.base.ggtt_offset << 10; 1466 + 1467 + intel_ring_advance(rq, cs); 1468 + 1469 + return 0; 1470 + } 1471 + 1429 1472 static inline int mi_set_context(struct i915_request *rq, u32 flags) 1430 1473 { 1431 1474 struct drm_i915_private *i915 = rq->i915; ··· 1593 
1590 static int switch_context(struct i915_request *rq) 1594 1591 { 1595 1592 struct intel_engine_cs *engine = rq->engine; 1596 - struct i915_gem_context *to_ctx = rq->gem_context; 1597 - struct i915_hw_ppgtt *to_mm = 1598 - to_ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt; 1599 - struct i915_gem_context *from_ctx = engine->legacy_active_context; 1600 - struct i915_hw_ppgtt *from_mm = engine->legacy_active_ppgtt; 1593 + struct i915_gem_context *ctx = rq->gem_context; 1594 + struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt; 1595 + unsigned int unwind_mm = 0; 1601 1596 u32 hw_flags = 0; 1602 1597 int ret, i; 1603 1598 1604 1599 lockdep_assert_held(&rq->i915->drm.struct_mutex); 1605 1600 GEM_BUG_ON(HAS_EXECLISTS(rq->i915)); 1606 1601 1607 - if (to_mm != from_mm || 1608 - (to_mm && intel_engine_flag(engine) & to_mm->pd_dirty_rings)) { 1609 - trace_switch_mm(engine, to_ctx); 1610 - ret = to_mm->switch_mm(to_mm, rq); 1602 + if (ppgtt) { 1603 + ret = load_pd_dir(rq, ppgtt); 1611 1604 if (ret) 1612 1605 goto err; 1613 1606 1614 - to_mm->pd_dirty_rings &= ~intel_engine_flag(engine); 1615 - engine->legacy_active_ppgtt = to_mm; 1616 - hw_flags = MI_FORCE_RESTORE; 1607 + if (intel_engine_flag(engine) & ppgtt->pd_dirty_rings) { 1608 + unwind_mm = intel_engine_flag(engine); 1609 + ppgtt->pd_dirty_rings &= ~unwind_mm; 1610 + hw_flags = MI_FORCE_RESTORE; 1611 + } 1617 1612 } 1618 1613 1619 - if (rq->hw_context->state && 1620 - (to_ctx != from_ctx || hw_flags & MI_FORCE_RESTORE)) { 1614 + if (rq->hw_context->state) { 1621 1615 GEM_BUG_ON(engine->id != RCS); 1622 1616 1623 1617 /* ··· 1624 1624 * as nothing actually executes using the kernel context; it 1625 1625 * is purely used for flushing user contexts. 
1626 1626 */ 1627 - if (i915_gem_context_is_kernel(to_ctx)) 1627 + if (i915_gem_context_is_kernel(ctx)) 1628 1628 hw_flags = MI_RESTORE_INHIBIT; 1629 1629 1630 1630 ret = mi_set_context(rq, hw_flags); 1631 1631 if (ret) 1632 1632 goto err_mm; 1633 - 1634 - engine->legacy_active_context = to_ctx; 1635 1633 } 1636 1634 1637 - if (to_ctx->remap_slice) { 1635 + if (ctx->remap_slice) { 1638 1636 for (i = 0; i < MAX_L3_SLICES; i++) { 1639 - if (!(to_ctx->remap_slice & BIT(i))) 1637 + if (!(ctx->remap_slice & BIT(i))) 1640 1638 continue; 1641 1639 1642 1640 ret = remap_l3(rq, i); 1643 1641 if (ret) 1644 - goto err_ctx; 1642 + goto err_mm; 1645 1643 } 1646 1644 1647 - to_ctx->remap_slice = 0; 1645 + ctx->remap_slice = 0; 1648 1646 } 1649 1647 1650 1648 return 0; 1651 1649 1652 - err_ctx: 1653 - engine->legacy_active_context = from_ctx; 1654 1650 err_mm: 1655 - engine->legacy_active_ppgtt = from_mm; 1651 + if (unwind_mm) 1652 + ppgtt->pd_dirty_rings |= unwind_mm; 1656 1653 err: 1657 1654 return ret; 1658 1655 }
-9
drivers/gpu/drm/i915/intel_ringbuffer.h
··· 557 557 */ 558 558 struct intel_context *last_retired_context; 559 559 560 - /* We track the current MI_SET_CONTEXT in order to eliminate 561 - * redudant context switches. This presumes that requests are not 562 - * reordered! Or when they are the tracking is updated along with 563 - * the emission of individual requests into the legacy command 564 - * stream (ring). 565 - */ 566 - struct i915_gem_context *legacy_active_context; 567 - struct i915_hw_ppgtt *legacy_active_ppgtt; 568 - 569 560 /* status_notifier: list of callbacks for context-switch changes */ 570 561 struct atomic_notifier_head context_status_notifier; 571 562