Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/i915: Add mechanism to submit a context WA on ring submission

This patch adds a framework to submit an arbitrary batchbuffer on each
context switch to clear residual state for the render engine on Gen7/7.5
devices.

The idea of always emitting the context and vm setup around each request
is primarily to make reset recovery easy, and to avoid requiring a rewrite
of the ringbuffer. As each request would set up its own context, leaving it
to the HW to notice and elide no-op context switches, we could restart the
ring at any point, and reorder the requests freely.

However, to avoid emitting clear_residuals() between consecutive requests
in the ringbuffer of the same context, we do want to track the current
context in the ring. In doing so, we need to be careful to only record a
context switch when we are sure the next request will be emitted.

This security mitigation change does not trigger any performance
regression. Performance is on par with current mainline/drm-tip.

v2: Update vm_alias params to point to correct address space "vm" due to
changes made in the patch "f21613797bae98773"

v3-v4: none

Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
Signed-off-by: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Balestrieri Francesco <francesco.balestrieri@intel.com>
Cc: Bloomfield Jon <jon.bloomfield@intel.com>
Cc: Dutt Sudeep <sudeep.dutt@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20200306000957.2836150-1-chris@chris-wilson.co.uk

authored by

Mika Kuoppala and committed by
Chris Wilson
ee2413ee 81dcef4c

+425 -4
+134 -4
drivers/gpu/drm/i915/gt/intel_ring_submission.c
··· 1356 1356 return rq->engine->emit_flush(rq, EMIT_FLUSH); 1357 1357 } 1358 1358 1359 - static inline int mi_set_context(struct i915_request *rq, u32 flags) 1359 + static inline int mi_set_context(struct i915_request *rq, 1360 + struct intel_context *ce, 1361 + u32 flags) 1360 1362 { 1361 1363 struct drm_i915_private *i915 = rq->i915; 1362 1364 struct intel_engine_cs *engine = rq->engine; ··· 1433 1431 1434 1432 *cs++ = MI_NOOP; 1435 1433 *cs++ = MI_SET_CONTEXT; 1436 - *cs++ = i915_ggtt_offset(rq->context->state) | flags; 1434 + *cs++ = i915_ggtt_offset(ce->state) | flags; 1437 1435 /* 1438 1436 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP 1439 1437 * WaMiSetContext_Hang:snb,ivb,vlv ··· 1548 1546 return rq->engine->emit_flush(rq, EMIT_INVALIDATE); 1549 1547 } 1550 1548 1549 + static int clear_residuals(struct i915_request *rq) 1550 + { 1551 + struct intel_engine_cs *engine = rq->engine; 1552 + int ret; 1553 + 1554 + ret = switch_mm(rq, vm_alias(engine->kernel_context->vm)); 1555 + if (ret) 1556 + return ret; 1557 + 1558 + if (engine->kernel_context->state) { 1559 + ret = mi_set_context(rq, 1560 + engine->kernel_context, 1561 + MI_MM_SPACE_GTT | MI_RESTORE_INHIBIT); 1562 + if (ret) 1563 + return ret; 1564 + } 1565 + 1566 + ret = engine->emit_bb_start(rq, 1567 + engine->wa_ctx.vma->node.start, 0, 1568 + 0); 1569 + if (ret) 1570 + return ret; 1571 + 1572 + ret = engine->emit_flush(rq, EMIT_FLUSH); 1573 + if (ret) 1574 + return ret; 1575 + 1576 + /* Always invalidate before the next switch_mm() */ 1577 + return engine->emit_flush(rq, EMIT_INVALIDATE); 1578 + } 1579 + 1551 1580 static int switch_context(struct i915_request *rq) 1552 1581 { 1582 + struct intel_engine_cs *engine = rq->engine; 1553 1583 struct intel_context *ce = rq->context; 1584 + void **residuals = NULL; 1554 1585 int ret; 1555 1586 1556 1587 GEM_BUG_ON(HAS_EXECLISTS(rq->i915)); 1588 + 1589 + if (engine->wa_ctx.vma && ce != engine->kernel_context) { 1590 + if (engine->wa_ctx.vma->private 
!= ce) { 1591 + ret = clear_residuals(rq); 1592 + if (ret) 1593 + return ret; 1594 + 1595 + residuals = &engine->wa_ctx.vma->private; 1596 + } 1597 + } 1557 1598 1558 1599 ret = switch_mm(rq, vm_alias(ce->vm)); 1559 1600 if (ret) ··· 1605 1560 if (ce->state) { 1606 1561 u32 flags; 1607 1562 1608 - GEM_BUG_ON(rq->engine->id != RCS0); 1563 + GEM_BUG_ON(engine->id != RCS0); 1609 1564 1610 1565 /* For resource streamer on HSW+ and power context elsewhere */ 1611 1566 BUILD_BUG_ON(HSW_MI_RS_SAVE_STATE_EN != MI_SAVE_EXT_STATE_EN); ··· 1617 1572 else 1618 1573 flags |= MI_RESTORE_INHIBIT; 1619 1574 1620 - ret = mi_set_context(rq, flags); 1575 + ret = mi_set_context(rq, ce, flags); 1621 1576 if (ret) 1622 1577 return ret; 1623 1578 } ··· 1625 1580 ret = remap_l3(rq); 1626 1581 if (ret) 1627 1582 return ret; 1583 + 1584 + /* 1585 + * Now past the point of no return, this request _will_ be emitted. 1586 + * 1587 + * Or at least this preamble will be emitted, the request may be 1588 + * interrupted prior to submitting the user payload. If so, we 1589 + * still submit the "empty" request in order to preserve global 1590 + * state tracking such as this, our tracking of the current 1591 + * dirty context. 
1592 + */ 1593 + if (residuals) { 1594 + intel_context_put(*residuals); 1595 + *residuals = intel_context_get(ce); 1596 + } 1628 1597 1629 1598 return 0; 1630 1599 } ··· 1824 1765 1825 1766 intel_engine_cleanup_common(engine); 1826 1767 1768 + if (engine->wa_ctx.vma) { 1769 + intel_context_put(engine->wa_ctx.vma->private); 1770 + i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 1771 + } 1772 + 1827 1773 intel_ring_unpin(engine->legacy.ring); 1828 1774 intel_ring_put(engine->legacy.ring); 1829 1775 ··· 1976 1912 engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb; 1977 1913 } 1978 1914 1915 + static int gen7_ctx_switch_bb_setup(struct intel_engine_cs * const engine, 1916 + struct i915_vma * const vma) 1917 + { 1918 + return 0; 1919 + } 1920 + 1921 + static int gen7_ctx_switch_bb_init(struct intel_engine_cs *engine) 1922 + { 1923 + struct drm_i915_gem_object *obj; 1924 + struct i915_vma *vma; 1925 + int size; 1926 + int err; 1927 + 1928 + size = gen7_ctx_switch_bb_setup(engine, NULL /* probe size */); 1929 + if (size <= 0) 1930 + return size; 1931 + 1932 + size = ALIGN(size, PAGE_SIZE); 1933 + obj = i915_gem_object_create_internal(engine->i915, size); 1934 + if (IS_ERR(obj)) 1935 + return PTR_ERR(obj); 1936 + 1937 + vma = i915_vma_instance(obj, engine->gt->vm, NULL); 1938 + if (IS_ERR(vma)) { 1939 + err = PTR_ERR(vma); 1940 + goto err_obj; 1941 + } 1942 + 1943 + vma->private = intel_context_create(engine); /* dummy residuals */ 1944 + if (IS_ERR(vma->private)) { 1945 + err = PTR_ERR(vma->private); 1946 + goto err_obj; 1947 + } 1948 + 1949 + err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH); 1950 + if (err) 1951 + goto err_private; 1952 + 1953 + err = gen7_ctx_switch_bb_setup(engine, vma); 1954 + if (err) 1955 + goto err_unpin; 1956 + 1957 + engine->wa_ctx.vma = vma; 1958 + return 0; 1959 + 1960 + err_unpin: 1961 + i915_vma_unpin(vma); 1962 + err_private: 1963 + intel_context_put(vma->private); 1964 + err_obj: 1965 + i915_gem_object_put(obj); 1966 + return 
err; 1967 + } 1968 + 1979 1969 int intel_ring_submission_setup(struct intel_engine_cs *engine) 1980 1970 { 1981 1971 struct intel_timeline *timeline; ··· 2083 1965 2084 1966 GEM_BUG_ON(timeline->hwsp_ggtt != engine->status_page.vma); 2085 1967 1968 + if (IS_GEN(engine->i915, 7) && engine->class == RENDER_CLASS) { 1969 + err = gen7_ctx_switch_bb_init(engine); 1970 + if (err) 1971 + goto err_ring_unpin; 1972 + } 1973 + 2086 1974 /* Finally, take ownership and responsibility for cleanup! */ 2087 1975 engine->release = ring_release; 2088 1976 2089 1977 return 0; 2090 1978 1979 + err_ring_unpin: 1980 + intel_ring_unpin(ring); 2091 1981 err_ring: 2092 1982 intel_ring_put(ring); 2093 1983 err_timeline_unpin: ··· 2106 1980 intel_engine_cleanup_common(engine); 2107 1981 return err; 2108 1982 } 1983 + 1984 + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1985 + #include "selftest_ring_submission.c" 1986 + #endif
+290
drivers/gpu/drm/i915/gt/selftest_ring_submission.c
··· 1 + // SPDX-License-Identifier: MIT 2 + /* 3 + * Copyright © 2020 Intel Corporation 4 + */ 5 + 6 + #include "intel_engine_pm.h" 7 + #include "selftests/igt_flush_test.h" 8 + 9 + static struct i915_vma *create_wally(struct intel_engine_cs *engine) 10 + { 11 + struct drm_i915_gem_object *obj; 12 + struct i915_vma *vma; 13 + u32 *cs; 14 + int err; 15 + 16 + obj = i915_gem_object_create_internal(engine->i915, 4096); 17 + if (IS_ERR(obj)) 18 + return ERR_CAST(obj); 19 + 20 + vma = i915_vma_instance(obj, engine->gt->vm, NULL); 21 + if (IS_ERR(vma)) { 22 + i915_gem_object_put(obj); 23 + return vma; 24 + } 25 + 26 + err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH); 27 + if (err) { 28 + i915_gem_object_put(obj); 29 + return ERR_PTR(err); 30 + } 31 + 32 + cs = i915_gem_object_pin_map(obj, I915_MAP_WC); 33 + if (IS_ERR(cs)) { 34 + i915_gem_object_put(obj); 35 + return ERR_CAST(cs); 36 + } 37 + 38 + if (INTEL_GEN(engine->i915) >= 6) { 39 + *cs++ = MI_STORE_DWORD_IMM_GEN4; 40 + *cs++ = 0; 41 + } else if (INTEL_GEN(engine->i915) >= 4) { 42 + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 43 + *cs++ = 0; 44 + } else { 45 + *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 46 + } 47 + *cs++ = vma->node.start + 4000; 48 + *cs++ = STACK_MAGIC; 49 + 50 + *cs++ = MI_BATCH_BUFFER_END; 51 + i915_gem_object_unpin_map(obj); 52 + 53 + vma->private = intel_context_create(engine); /* dummy residuals */ 54 + if (IS_ERR(vma->private)) { 55 + vma = ERR_CAST(vma->private); 56 + i915_gem_object_put(obj); 57 + } 58 + 59 + return vma; 60 + } 61 + 62 + static int context_sync(struct intel_context *ce) 63 + { 64 + struct i915_request *rq; 65 + int err = 0; 66 + 67 + rq = intel_context_create_request(ce); 68 + if (IS_ERR(rq)) 69 + return PTR_ERR(rq); 70 + 71 + i915_request_get(rq); 72 + i915_request_add(rq); 73 + 74 + if (i915_request_wait(rq, 0, HZ / 5) < 0) 75 + err = -ETIME; 76 + i915_request_put(rq); 77 + 78 + return err; 79 + } 80 + 81 + static int new_context_sync(struct intel_engine_cs 
*engine) 82 + { 83 + struct intel_context *ce; 84 + int err; 85 + 86 + ce = intel_context_create(engine); 87 + if (IS_ERR(ce)) 88 + return PTR_ERR(ce); 89 + 90 + err = context_sync(ce); 91 + intel_context_put(ce); 92 + 93 + return err; 94 + } 95 + 96 + static int mixed_contexts_sync(struct intel_engine_cs *engine, u32 *result) 97 + { 98 + int pass; 99 + int err; 100 + 101 + for (pass = 0; pass < 2; pass++) { 102 + WRITE_ONCE(*result, 0); 103 + err = context_sync(engine->kernel_context); 104 + if (err || READ_ONCE(*result)) { 105 + if (!err) { 106 + pr_err("pass[%d] wa_bb emitted for the kernel context\n", 107 + pass); 108 + err = -EINVAL; 109 + } 110 + return err; 111 + } 112 + 113 + WRITE_ONCE(*result, 0); 114 + err = new_context_sync(engine); 115 + if (READ_ONCE(*result) != STACK_MAGIC) { 116 + if (!err) { 117 + pr_err("pass[%d] wa_bb *NOT* emitted after the kernel context\n", 118 + pass); 119 + err = -EINVAL; 120 + } 121 + return err; 122 + } 123 + 124 + WRITE_ONCE(*result, 0); 125 + err = new_context_sync(engine); 126 + if (READ_ONCE(*result) != STACK_MAGIC) { 127 + if (!err) { 128 + pr_err("pass[%d] wa_bb *NOT* emitted for the user context switch\n", 129 + pass); 130 + err = -EINVAL; 131 + } 132 + return err; 133 + } 134 + } 135 + 136 + return 0; 137 + } 138 + 139 + static int double_context_sync_00(struct intel_engine_cs *engine, u32 *result) 140 + { 141 + struct intel_context *ce; 142 + int err, i; 143 + 144 + ce = intel_context_create(engine); 145 + if (IS_ERR(ce)) 146 + return PTR_ERR(ce); 147 + 148 + for (i = 0; i < 2; i++) { 149 + WRITE_ONCE(*result, 0); 150 + err = context_sync(ce); 151 + if (err) 152 + break; 153 + } 154 + intel_context_put(ce); 155 + if (err) 156 + return err; 157 + 158 + if (READ_ONCE(*result)) { 159 + pr_err("wa_bb emitted between the same user context\n"); 160 + return -EINVAL; 161 + } 162 + 163 + return 0; 164 + } 165 + 166 + static int kernel_context_sync_00(struct intel_engine_cs *engine, u32 *result) 167 + { 168 + struct 
intel_context *ce; 169 + int err, i; 170 + 171 + ce = intel_context_create(engine); 172 + if (IS_ERR(ce)) 173 + return PTR_ERR(ce); 174 + 175 + for (i = 0; i < 2; i++) { 176 + WRITE_ONCE(*result, 0); 177 + err = context_sync(ce); 178 + if (err) 179 + break; 180 + 181 + err = context_sync(engine->kernel_context); 182 + if (err) 183 + break; 184 + } 185 + intel_context_put(ce); 186 + if (err) 187 + return err; 188 + 189 + if (READ_ONCE(*result)) { 190 + pr_err("wa_bb emitted between the same user context [with intervening kernel]\n"); 191 + return -EINVAL; 192 + } 193 + 194 + return 0; 195 + } 196 + 197 + static int __live_ctx_switch_wa(struct intel_engine_cs *engine) 198 + { 199 + struct i915_vma *bb; 200 + u32 *result; 201 + int err; 202 + 203 + bb = create_wally(engine); 204 + if (IS_ERR(bb)) 205 + return PTR_ERR(bb); 206 + 207 + result = i915_gem_object_pin_map(bb->obj, I915_MAP_WC); 208 + if (IS_ERR(result)) { 209 + intel_context_put(bb->private); 210 + i915_vma_unpin_and_release(&bb, 0); 211 + return PTR_ERR(result); 212 + } 213 + result += 1000; 214 + 215 + engine->wa_ctx.vma = bb; 216 + 217 + err = mixed_contexts_sync(engine, result); 218 + if (err) 219 + goto out; 220 + 221 + err = double_context_sync_00(engine, result); 222 + if (err) 223 + goto out; 224 + 225 + err = kernel_context_sync_00(engine, result); 226 + if (err) 227 + goto out; 228 + 229 + out: 230 + intel_context_put(engine->wa_ctx.vma->private); 231 + i915_vma_unpin_and_release(&engine->wa_ctx.vma, I915_VMA_RELEASE_MAP); 232 + return err; 233 + } 234 + 235 + static int live_ctx_switch_wa(void *arg) 236 + { 237 + struct intel_gt *gt = arg; 238 + struct intel_engine_cs *engine; 239 + enum intel_engine_id id; 240 + 241 + /* 242 + * Exercise the inter-context wa batch. 243 + * 244 + * Between each user context we run a wa batch, and since it may 245 + * have implications for user visible state, we have to check that 246 + * we do actually execute it. 
247 + * 248 + * The trick we use is to replace the normal wa batch with a custom 249 + * one that writes to a marker within it, and we can then look for 250 + * that marker to confirm if the batch was run when we expect it, 251 + * and equally important it was wasn't run when we don't! 252 + */ 253 + 254 + for_each_engine(engine, gt, id) { 255 + struct i915_vma *saved_wa; 256 + int err; 257 + 258 + if (!intel_engine_can_store_dword(engine)) 259 + continue; 260 + 261 + if (IS_GEN_RANGE(gt->i915, 4, 5)) 262 + continue; /* MI_STORE_DWORD is privileged! */ 263 + 264 + saved_wa = fetch_and_zero(&engine->wa_ctx.vma); 265 + 266 + intel_engine_pm_get(engine); 267 + err = __live_ctx_switch_wa(engine); 268 + intel_engine_pm_put(engine); 269 + if (igt_flush_test(gt->i915)) 270 + err = -EIO; 271 + 272 + engine->wa_ctx.vma = saved_wa; 273 + if (err) 274 + return err; 275 + } 276 + 277 + return 0; 278 + } 279 + 280 + int intel_ring_submission_live_selftests(struct drm_i915_private *i915) 281 + { 282 + static const struct i915_subtest tests[] = { 283 + SUBTEST(live_ctx_switch_wa), 284 + }; 285 + 286 + if (HAS_EXECLISTS(i915)) 287 + return 0; 288 + 289 + return intel_gt_live_subtests(tests, &i915->gt); 290 + }
+1
drivers/gpu/drm/i915/selftests/i915_live_selftests.h
··· 43 43 selftest(memory_region, intel_memory_region_live_selftests) 44 44 selftest(hangcheck, intel_hangcheck_live_selftests) 45 45 selftest(execlists, intel_execlists_live_selftests) 46 + selftest(ring_submission, intel_ring_submission_live_selftests) 46 47 selftest(perf, i915_perf_live_selftests) 47 48 /* Here be dragons: keep last to run last! */ 48 49 selftest(late_gt_pm, intel_gt_pm_late_selftests)