Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/i915/gem: Implement legacy MI_STORE_DATA_IMM

The older arches did not convert MI_STORE_DATA_IMM to using the GTT, but
left it writing to a physical address. The notes suggest that the
primary reason would be so that the writes were cache coherent, as the
CPU cache uses physical tagging. As such we did not implement the
legacy variant of MI_STORE_DATA_IMM and so left all the relocations
synchronous -- but with a small function to convert from the vma address
into the physical address, we can implement asynchronous relocs on these
older arches, fixing up a few tests that require them.

In order to be able to test the legacy paths, refactor the GPU
relocations so that we can hook them up to a selftest.

v2: Use an array of offsets, not enum labels, for the selftest
v3: Refactor the common igt_hexdump()

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/757
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200504140629.28240-1-chris@chris-wilson.co.uk

+337 -136
+131 -75
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
··· 955 955 cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment; 956 956 cache->node.flags = 0; 957 957 cache->rq = NULL; 958 - cache->rq_size = 0; 958 + cache->target = NULL; 959 959 } 960 960 961 961 static inline void *unmask_page(unsigned long p) ··· 1325 1325 1326 1326 ce = intel_context_create(engine); 1327 1327 if (IS_ERR(ce)) { 1328 - err = PTR_ERR(rq); 1328 + err = PTR_ERR(ce); 1329 1329 goto err_unpin; 1330 1330 } 1331 1331 ··· 1376 1376 return err; 1377 1377 } 1378 1378 1379 + static bool reloc_can_use_engine(const struct intel_engine_cs *engine) 1380 + { 1381 + return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6); 1382 + } 1383 + 1379 1384 static u32 *reloc_gpu(struct i915_execbuffer *eb, 1380 1385 struct i915_vma *vma, 1381 1386 unsigned int len) ··· 1392 1387 if (unlikely(!cache->rq)) { 1393 1388 struct intel_engine_cs *engine = eb->engine; 1394 1389 1395 - if (!intel_engine_can_store_dword(engine)) { 1390 + if (!reloc_can_use_engine(engine)) { 1396 1391 engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0]; 1397 - if (!engine || !intel_engine_can_store_dword(engine)) 1392 + if (!engine) 1398 1393 return ERR_PTR(-ENODEV); 1399 1394 } 1400 1395 ··· 1440 1435 return !dma_resv_test_signaled_rcu(vma->resv, true); 1441 1436 } 1442 1437 1438 + static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset) 1439 + { 1440 + struct page *page; 1441 + unsigned long addr; 1442 + 1443 + GEM_BUG_ON(vma->pages != vma->obj->mm.pages); 1444 + 1445 + page = i915_gem_object_get_page(vma->obj, offset >> PAGE_SHIFT); 1446 + addr = PFN_PHYS(page_to_pfn(page)); 1447 + GEM_BUG_ON(overflows_type(addr, u32)); /* expected dma32 */ 1448 + 1449 + return addr + offset_in_page(offset); 1450 + } 1451 + 1452 + static bool __reloc_entry_gpu(struct i915_execbuffer *eb, 1453 + struct i915_vma *vma, 1454 + u64 offset, 1455 + u64 target_addr) 1456 + { 1457 + const unsigned int gen = eb->reloc_cache.gen; 1458 + unsigned int len; 1459 + u32 *batch; 
1460 + u64 addr; 1461 + 1462 + if (gen >= 8) 1463 + len = offset & 7 ? 8 : 5; 1464 + else if (gen >= 4) 1465 + len = 4; 1466 + else 1467 + len = 3; 1468 + 1469 + batch = reloc_gpu(eb, vma, len); 1470 + if (IS_ERR(batch)) 1471 + return false; 1472 + 1473 + addr = gen8_canonical_addr(vma->node.start + offset); 1474 + if (gen >= 8) { 1475 + if (offset & 7) { 1476 + *batch++ = MI_STORE_DWORD_IMM_GEN4; 1477 + *batch++ = lower_32_bits(addr); 1478 + *batch++ = upper_32_bits(addr); 1479 + *batch++ = lower_32_bits(target_addr); 1480 + 1481 + addr = gen8_canonical_addr(addr + 4); 1482 + 1483 + *batch++ = MI_STORE_DWORD_IMM_GEN4; 1484 + *batch++ = lower_32_bits(addr); 1485 + *batch++ = upper_32_bits(addr); 1486 + *batch++ = upper_32_bits(target_addr); 1487 + } else { 1488 + *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1; 1489 + *batch++ = lower_32_bits(addr); 1490 + *batch++ = upper_32_bits(addr); 1491 + *batch++ = lower_32_bits(target_addr); 1492 + *batch++ = upper_32_bits(target_addr); 1493 + } 1494 + } else if (gen >= 6) { 1495 + *batch++ = MI_STORE_DWORD_IMM_GEN4; 1496 + *batch++ = 0; 1497 + *batch++ = addr; 1498 + *batch++ = target_addr; 1499 + } else if (IS_I965G(eb->i915)) { 1500 + *batch++ = MI_STORE_DWORD_IMM_GEN4; 1501 + *batch++ = 0; 1502 + *batch++ = vma_phys_addr(vma, offset); 1503 + *batch++ = target_addr; 1504 + } else if (gen >= 4) { 1505 + *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1506 + *batch++ = 0; 1507 + *batch++ = addr; 1508 + *batch++ = target_addr; 1509 + } else if (gen >= 3 && 1510 + !(IS_I915G(eb->i915) || IS_I915GM(eb->i915))) { 1511 + *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 1512 + *batch++ = addr; 1513 + *batch++ = target_addr; 1514 + } else { 1515 + *batch++ = MI_STORE_DWORD_IMM; 1516 + *batch++ = vma_phys_addr(vma, offset); 1517 + *batch++ = target_addr; 1518 + } 1519 + 1520 + return true; 1521 + } 1522 + 1523 + static bool reloc_entry_gpu(struct i915_execbuffer *eb, 1524 + struct i915_vma *vma, 1525 + u64 offset, 1526 + u64 
target_addr) 1527 + { 1528 + if (eb->reloc_cache.vaddr) 1529 + return false; 1530 + 1531 + if (!use_reloc_gpu(vma)) 1532 + return false; 1533 + 1534 + return __reloc_entry_gpu(eb, vma, offset, target_addr); 1535 + } 1536 + 1443 1537 static u64 1444 1538 relocate_entry(struct i915_vma *vma, 1445 1539 const struct drm_i915_gem_relocation_entry *reloc, 1446 1540 struct i915_execbuffer *eb, 1447 1541 const struct i915_vma *target) 1448 1542 { 1543 + u64 target_addr = relocation_target(reloc, target); 1449 1544 u64 offset = reloc->offset; 1450 - u64 target_offset = relocation_target(reloc, target); 1451 - bool wide = eb->reloc_cache.use_64bit_reloc; 1452 - void *vaddr; 1453 1545 1454 - if (!eb->reloc_cache.vaddr && use_reloc_gpu(vma)) { 1455 - const unsigned int gen = eb->reloc_cache.gen; 1456 - unsigned int len; 1457 - u32 *batch; 1458 - u64 addr; 1459 - 1460 - if (wide) 1461 - len = offset & 7 ? 8 : 5; 1462 - else if (gen >= 4) 1463 - len = 4; 1464 - else 1465 - len = 3; 1466 - 1467 - batch = reloc_gpu(eb, vma, len); 1468 - if (IS_ERR(batch)) 1469 - goto repeat; 1470 - 1471 - addr = gen8_canonical_addr(vma->node.start + offset); 1472 - if (wide) { 1473 - if (offset & 7) { 1474 - *batch++ = MI_STORE_DWORD_IMM_GEN4; 1475 - *batch++ = lower_32_bits(addr); 1476 - *batch++ = upper_32_bits(addr); 1477 - *batch++ = lower_32_bits(target_offset); 1478 - 1479 - addr = gen8_canonical_addr(addr + 4); 1480 - 1481 - *batch++ = MI_STORE_DWORD_IMM_GEN4; 1482 - *batch++ = lower_32_bits(addr); 1483 - *batch++ = upper_32_bits(addr); 1484 - *batch++ = upper_32_bits(target_offset); 1485 - } else { 1486 - *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1; 1487 - *batch++ = lower_32_bits(addr); 1488 - *batch++ = upper_32_bits(addr); 1489 - *batch++ = lower_32_bits(target_offset); 1490 - *batch++ = upper_32_bits(target_offset); 1491 - } 1492 - } else if (gen >= 6) { 1493 - *batch++ = MI_STORE_DWORD_IMM_GEN4; 1494 - *batch++ = 0; 1495 - *batch++ = addr; 1496 - *batch++ = target_offset; 
1497 - } else if (gen >= 4) { 1498 - *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1499 - *batch++ = 0; 1500 - *batch++ = addr; 1501 - *batch++ = target_offset; 1502 - } else { 1503 - *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 1504 - *batch++ = addr; 1505 - *batch++ = target_offset; 1506 - } 1507 - 1508 - goto out; 1509 - } 1546 + if (!reloc_entry_gpu(eb, vma, offset, target_addr)) { 1547 + bool wide = eb->reloc_cache.use_64bit_reloc; 1548 + void *vaddr; 1510 1549 1511 1550 repeat: 1512 - vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT); 1513 - if (IS_ERR(vaddr)) 1514 - return PTR_ERR(vaddr); 1551 + vaddr = reloc_vaddr(vma->obj, 1552 + &eb->reloc_cache, 1553 + offset >> PAGE_SHIFT); 1554 + if (IS_ERR(vaddr)) 1555 + return PTR_ERR(vaddr); 1515 1556 1516 - clflush_write32(vaddr + offset_in_page(offset), 1517 - lower_32_bits(target_offset), 1518 - eb->reloc_cache.vaddr); 1557 + GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32))); 1558 + clflush_write32(vaddr + offset_in_page(offset), 1559 + lower_32_bits(target_addr), 1560 + eb->reloc_cache.vaddr); 1519 1561 1520 - if (wide) { 1521 - offset += sizeof(u32); 1522 - target_offset >>= 32; 1523 - wide = false; 1524 - goto repeat; 1562 + if (wide) { 1563 + offset += sizeof(u32); 1564 + target_addr >>= 32; 1565 + wide = false; 1566 + goto repeat; 1567 + } 1525 1568 } 1526 1569 1527 - out: 1528 1570 return target->node.start | UPDATE; 1529 1571 } 1530 1572 ··· 3074 3022 kvfree(exec2_list); 3075 3023 return err; 3076 3024 } 3025 + 3026 + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 3027 + #include "selftests/i915_gem_execbuffer.c" 3028 + #endif
+1 -30
drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c
··· 302 302 i915_gem_object_flush_map(t->scratch.vma->obj); 303 303 } 304 304 305 - static void hexdump(const void *buf, size_t len) 306 - { 307 - const size_t rowsize = 8 * sizeof(u32); 308 - const void *prev = NULL; 309 - bool skip = false; 310 - size_t pos; 311 - 312 - for (pos = 0; pos < len; pos += rowsize) { 313 - char line[128]; 314 - 315 - if (prev && !memcmp(prev, buf + pos, rowsize)) { 316 - if (!skip) { 317 - pr_info("*\n"); 318 - skip = true; 319 - } 320 - continue; 321 - } 322 - 323 - WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos, 324 - rowsize, sizeof(u32), 325 - line, sizeof(line), 326 - false) >= sizeof(line)); 327 - pr_info("[%04zx] %s\n", pos, line); 328 - 329 - prev = buf + pos; 330 - skip = false; 331 - } 332 - } 333 - 334 305 static u64 swizzle_bit(unsigned int bit, u64 offset) 335 306 { 336 307 return (offset & BIT_ULL(bit)) >> (bit - 6); ··· 397 426 pr_err("Invalid %s tiling detected at (%d, %d), start_val %x\n", 398 427 repr_tiling(buf->tiling), 399 428 x, y, buf->start_val); 400 - hexdump(vaddr, 4096); 429 + igt_hexdump(vaddr, 4096); 401 430 } 402 431 403 432 i915_gem_object_unpin_map(buf->vma->obj);
+171
drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
··· 1 + // SPDX-License-Identifier: MIT 2 + /* 3 + * Copyright © 2020 Intel Corporation 4 + */ 5 + 6 + #include "i915_selftest.h" 7 + 8 + #include "gt/intel_engine_pm.h" 9 + #include "selftests/igt_flush_test.h" 10 + 11 + static u64 read_reloc(const u32 *map, int x, const u64 mask) 12 + { 13 + u64 reloc; 14 + 15 + memcpy(&reloc, &map[x], sizeof(reloc)); 16 + return reloc & mask; 17 + } 18 + 19 + static int __igt_gpu_reloc(struct i915_execbuffer *eb, 20 + struct drm_i915_gem_object *obj) 21 + { 22 + const unsigned int offsets[] = { 8, 3, 0 }; 23 + const u64 mask = 24 + GENMASK_ULL(eb->reloc_cache.use_64bit_reloc ? 63 : 31, 0); 25 + const u32 *map = page_mask_bits(obj->mm.mapping); 26 + struct i915_request *rq; 27 + struct i915_vma *vma; 28 + int err; 29 + int i; 30 + 31 + vma = i915_vma_instance(obj, eb->context->vm, NULL); 32 + if (IS_ERR(vma)) 33 + return PTR_ERR(vma); 34 + 35 + err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH); 36 + if (err) 37 + return err; 38 + 39 + /* 8-Byte aligned */ 40 + if (!__reloc_entry_gpu(eb, vma, 41 + offsets[0] * sizeof(u32), 42 + 0)) { 43 + err = -EIO; 44 + goto unpin_vma; 45 + } 46 + 47 + /* !8-Byte aligned */ 48 + if (!__reloc_entry_gpu(eb, vma, 49 + offsets[1] * sizeof(u32), 50 + 1)) { 51 + err = -EIO; 52 + goto unpin_vma; 53 + } 54 + 55 + /* Skip to the end of the cmd page */ 56 + i = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1; 57 + i -= eb->reloc_cache.rq_size; 58 + memset32(eb->reloc_cache.rq_cmd + eb->reloc_cache.rq_size, 59 + MI_NOOP, i); 60 + eb->reloc_cache.rq_size += i; 61 + 62 + /* Force batch chaining */ 63 + if (!__reloc_entry_gpu(eb, vma, 64 + offsets[2] * sizeof(u32), 65 + 2)) { 66 + err = -EIO; 67 + goto unpin_vma; 68 + } 69 + 70 + GEM_BUG_ON(!eb->reloc_cache.rq); 71 + rq = i915_request_get(eb->reloc_cache.rq); 72 + err = reloc_gpu_flush(&eb->reloc_cache); 73 + if (err) 74 + goto put_rq; 75 + GEM_BUG_ON(eb->reloc_cache.rq); 76 + 77 + err = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, HZ / 2); 78 + if (err) { 79 
+ intel_gt_set_wedged(eb->engine->gt); 80 + goto put_rq; 81 + } 82 + 83 + if (!i915_request_completed(rq)) { 84 + pr_err("%s: did not wait for relocations!\n", eb->engine->name); 85 + err = -EINVAL; 86 + goto put_rq; 87 + } 88 + 89 + for (i = 0; i < ARRAY_SIZE(offsets); i++) { 90 + u64 reloc = read_reloc(map, offsets[i], mask); 91 + 92 + if (reloc != i) { 93 + pr_err("%s[%d]: map[%d] %llx != %x\n", 94 + eb->engine->name, i, offsets[i], reloc, i); 95 + err = -EINVAL; 96 + } 97 + } 98 + if (err) 99 + igt_hexdump(map, 4096); 100 + 101 + put_rq: 102 + i915_request_put(rq); 103 + unpin_vma: 104 + i915_vma_unpin(vma); 105 + return err; 106 + } 107 + 108 + static int igt_gpu_reloc(void *arg) 109 + { 110 + struct i915_execbuffer eb; 111 + struct drm_i915_gem_object *scratch; 112 + int err = 0; 113 + u32 *map; 114 + 115 + eb.i915 = arg; 116 + 117 + scratch = i915_gem_object_create_internal(eb.i915, 4096); 118 + if (IS_ERR(scratch)) 119 + return PTR_ERR(scratch); 120 + 121 + map = i915_gem_object_pin_map(scratch, I915_MAP_WC); 122 + if (IS_ERR(map)) { 123 + err = PTR_ERR(map); 124 + goto err_scratch; 125 + } 126 + 127 + for_each_uabi_engine(eb.engine, eb.i915) { 128 + reloc_cache_init(&eb.reloc_cache, eb.i915); 129 + memset(map, POISON_INUSE, 4096); 130 + 131 + intel_engine_pm_get(eb.engine); 132 + eb.context = intel_context_create(eb.engine); 133 + if (IS_ERR(eb.context)) { 134 + err = PTR_ERR(eb.context); 135 + goto err_pm; 136 + } 137 + 138 + err = intel_context_pin(eb.context); 139 + if (err) 140 + goto err_put; 141 + 142 + err = __igt_gpu_reloc(&eb, scratch); 143 + 144 + intel_context_unpin(eb.context); 145 + err_put: 146 + intel_context_put(eb.context); 147 + err_pm: 148 + intel_engine_pm_put(eb.engine); 149 + if (err) 150 + break; 151 + } 152 + 153 + if (igt_flush_test(eb.i915)) 154 + err = -EIO; 155 + 156 + err_scratch: 157 + i915_gem_object_put(scratch); 158 + return err; 159 + } 160 + 161 + int i915_gem_execbuffer_live_selftests(struct drm_i915_private *i915) 162 + 
{ 163 + static const struct i915_subtest tests[] = { 164 + SUBTEST(igt_gpu_reloc), 165 + }; 166 + 167 + if (intel_gt_is_wedged(&i915->gt)) 168 + return 0; 169 + 170 + return i915_live_subtests(tests, i915); 171 + }
+2 -31
drivers/gpu/drm/i915/gt/selftest_lrc.c
··· 4342 4342 return intel_gt_live_subtests(tests, &i915->gt); 4343 4343 } 4344 4344 4345 - static void hexdump(const void *buf, size_t len) 4346 - { 4347 - const size_t rowsize = 8 * sizeof(u32); 4348 - const void *prev = NULL; 4349 - bool skip = false; 4350 - size_t pos; 4351 - 4352 - for (pos = 0; pos < len; pos += rowsize) { 4353 - char line[128]; 4354 - 4355 - if (prev && !memcmp(prev, buf + pos, rowsize)) { 4356 - if (!skip) { 4357 - pr_info("*\n"); 4358 - skip = true; 4359 - } 4360 - continue; 4361 - } 4362 - 4363 - WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos, 4364 - rowsize, sizeof(u32), 4365 - line, sizeof(line), 4366 - false) >= sizeof(line)); 4367 - pr_info("[%04zx] %s\n", pos, line); 4368 - 4369 - prev = buf + pos; 4370 - skip = false; 4371 - } 4372 - } 4373 - 4374 4345 static int emit_semaphore_signal(struct intel_context *ce, void *slot) 4375 4346 { 4376 4347 const u32 offset = ··· 4489 4518 4490 4519 if (err) { 4491 4520 pr_info("%s: HW register image:\n", engine->name); 4492 - hexdump(hw, PAGE_SIZE); 4521 + igt_hexdump(hw, PAGE_SIZE); 4493 4522 4494 4523 pr_info("%s: SW register image:\n", engine->name); 4495 - hexdump(lrc, PAGE_SIZE); 4524 + igt_hexdump(lrc, PAGE_SIZE); 4496 4525 } 4497 4526 4498 4527 shmem_unpin_map(engine->default_state, hw);
+2
drivers/gpu/drm/i915/i915_selftest.h
··· 133 133 #define igt_timeout(t, fmt, ...) \ 134 134 __igt_timeout((t), KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) 135 135 136 + void igt_hexdump(const void *buf, size_t len); 137 + 136 138 #endif /* !__I915_SELFTEST_H__ */
+1
drivers/gpu/drm/i915/selftests/i915_live_selftests.h
··· 37 37 selftest(evict, i915_gem_evict_live_selftests) 38 38 selftest(hugepages, i915_gem_huge_page_live_selftests) 39 39 selftest(gem_contexts, i915_gem_context_live_selftests) 40 + selftest(gem_execbuf, i915_gem_execbuffer_live_selftests) 40 41 selftest(blt, i915_gem_object_blt_live_selftests) 41 42 selftest(client, i915_gem_client_blt_live_selftests) 42 43 selftest(reset, intel_reset_live_selftests)
+29
drivers/gpu/drm/i915/selftests/i915_selftest.c
··· 396 396 return true; 397 397 } 398 398 399 + void igt_hexdump(const void *buf, size_t len) 400 + { 401 + const size_t rowsize = 8 * sizeof(u32); 402 + const void *prev = NULL; 403 + bool skip = false; 404 + size_t pos; 405 + 406 + for (pos = 0; pos < len; pos += rowsize) { 407 + char line[128]; 408 + 409 + if (prev && !memcmp(prev, buf + pos, rowsize)) { 410 + if (!skip) { 411 + pr_info("*\n"); 412 + skip = true; 413 + } 414 + continue; 415 + } 416 + 417 + WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos, 418 + rowsize, sizeof(u32), 419 + line, sizeof(line), 420 + false) >= sizeof(line)); 421 + pr_info("[%04zx] %s\n", pos, line); 422 + 423 + prev = buf + pos; 424 + skip = false; 425 + } 426 + } 427 + 399 428 module_param_named(st_random_seed, i915_selftest.random_seed, uint, 0400); 400 429 module_param_named(st_timeout, i915_selftest.timeout_ms, uint, 0400); 401 430 module_param_named(st_filter, i915_selftest.filter, charp, 0400);