Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/i915/gem: Async GPU relocations only

Reduce the 3 relocation paths down to the single path that accommodates
all. The primary motivation for this is to guard the relocations with a
natural fence (derived from the i915_request used to write the
relocation from the GPU).

The tradeoff in using async gpu relocations is that it increases latency
over using direct CPU relocations, for the cases where the target is
idle and accessible by the CPU. The benefit is greatly reduced lock
contention and improved concurrency by pipelining.

Note that forcing the async gpu relocations does reveal a few issues
they have. Firstly, they are visible as writes to gem_busy, causing us
to mark some buffers as being written to by the GPU even though
userspace only reads. Secondly, in combination with the cmdparser, they
can cause priority inversions. This appears to happen because the work
is put onto a common workqueue, losing our priority information, and so
is executed in FIFO order by the worker, denying us the opportunity to
reorder the requests afterwards.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200604211457.19696-1-chris@chris-wilson.co.uk

+26 -288
+20 -273
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
··· 45 45 struct eb_vma vma[]; 46 46 }; 47 47 48 - enum { 49 - FORCE_CPU_RELOC = 1, 50 - FORCE_GTT_RELOC, 51 - FORCE_GPU_RELOC, 52 - #define DBG_FORCE_RELOC 0 /* choose one of the above! */ 53 - }; 54 - 55 48 #define __EXEC_OBJECT_HAS_PIN BIT(31) 56 49 #define __EXEC_OBJECT_HAS_FENCE BIT(30) 57 50 #define __EXEC_OBJECT_NEEDS_MAP BIT(29) ··· 253 260 */ 254 261 struct reloc_cache { 255 262 struct drm_mm_node node; /** temporary GTT binding */ 256 - unsigned long vaddr; /** Current kmap address */ 257 - unsigned long page; /** Currently mapped page index */ 258 263 unsigned int gen; /** Cached value of INTEL_GEN */ 259 264 bool use_64bit_reloc : 1; 260 265 bool has_llc : 1; ··· 596 605 } 597 606 } 598 607 599 - static inline int use_cpu_reloc(const struct reloc_cache *cache, 600 - const struct drm_i915_gem_object *obj) 601 - { 602 - if (!i915_gem_object_has_struct_page(obj)) 603 - return false; 604 - 605 - if (DBG_FORCE_RELOC == FORCE_CPU_RELOC) 606 - return true; 607 - 608 - if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) 609 - return false; 610 - 611 - return (cache->has_llc || 612 - obj->cache_dirty || 613 - obj->cache_level != I915_CACHE_NONE); 614 - } 615 - 616 608 static int eb_reserve_vma(const struct i915_execbuffer *eb, 617 609 struct eb_vma *ev, 618 610 u64 pin_flags) ··· 919 945 static void reloc_cache_init(struct reloc_cache *cache, 920 946 struct drm_i915_private *i915) 921 947 { 922 - cache->page = -1; 923 - cache->vaddr = 0; 924 948 /* Must be a variable in the struct to allow GCC to unroll. 
*/ 925 949 cache->gen = INTEL_GEN(i915); 926 950 cache->has_llc = HAS_LLC(i915); ··· 1059 1087 i915_request_add(rq); 1060 1088 1061 1089 return err; 1062 - } 1063 - 1064 - static void reloc_cache_reset(struct reloc_cache *cache) 1065 - { 1066 - void *vaddr; 1067 - 1068 - if (!cache->vaddr) 1069 - return; 1070 - 1071 - vaddr = unmask_page(cache->vaddr); 1072 - if (cache->vaddr & KMAP) { 1073 - if (cache->vaddr & CLFLUSH_AFTER) 1074 - mb(); 1075 - 1076 - kunmap_atomic(vaddr); 1077 - i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm); 1078 - } else { 1079 - struct i915_ggtt *ggtt = cache_to_ggtt(cache); 1080 - 1081 - intel_gt_flush_ggtt_writes(ggtt->vm.gt); 1082 - io_mapping_unmap_atomic((void __iomem *)vaddr); 1083 - 1084 - if (drm_mm_node_allocated(&cache->node)) { 1085 - ggtt->vm.clear_range(&ggtt->vm, 1086 - cache->node.start, 1087 - cache->node.size); 1088 - mutex_lock(&ggtt->vm.mutex); 1089 - drm_mm_remove_node(&cache->node); 1090 - mutex_unlock(&ggtt->vm.mutex); 1091 - } else { 1092 - i915_vma_unpin((struct i915_vma *)cache->node.mm); 1093 - } 1094 - } 1095 - 1096 - cache->vaddr = 0; 1097 - cache->page = -1; 1098 - } 1099 - 1100 - static void *reloc_kmap(struct drm_i915_gem_object *obj, 1101 - struct reloc_cache *cache, 1102 - unsigned long page) 1103 - { 1104 - void *vaddr; 1105 - 1106 - if (cache->vaddr) { 1107 - kunmap_atomic(unmask_page(cache->vaddr)); 1108 - } else { 1109 - unsigned int flushes; 1110 - int err; 1111 - 1112 - err = i915_gem_object_prepare_write(obj, &flushes); 1113 - if (err) 1114 - return ERR_PTR(err); 1115 - 1116 - BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS); 1117 - BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK); 1118 - 1119 - cache->vaddr = flushes | KMAP; 1120 - cache->node.mm = (void *)obj; 1121 - if (flushes) 1122 - mb(); 1123 - } 1124 - 1125 - vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page)); 1126 - cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr; 1127 - cache->page = page; 1128 - 1129 
- return vaddr; 1130 - } 1131 - 1132 - static void *reloc_iomap(struct drm_i915_gem_object *obj, 1133 - struct reloc_cache *cache, 1134 - unsigned long page) 1135 - { 1136 - struct i915_ggtt *ggtt = cache_to_ggtt(cache); 1137 - unsigned long offset; 1138 - void *vaddr; 1139 - 1140 - if (cache->vaddr) { 1141 - intel_gt_flush_ggtt_writes(ggtt->vm.gt); 1142 - io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr)); 1143 - } else { 1144 - struct i915_vma *vma; 1145 - int err; 1146 - 1147 - if (i915_gem_object_is_tiled(obj)) 1148 - return ERR_PTR(-EINVAL); 1149 - 1150 - if (use_cpu_reloc(cache, obj)) 1151 - return NULL; 1152 - 1153 - i915_gem_object_lock(obj); 1154 - err = i915_gem_object_set_to_gtt_domain(obj, true); 1155 - i915_gem_object_unlock(obj); 1156 - if (err) 1157 - return ERR_PTR(err); 1158 - 1159 - vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 1160 - PIN_MAPPABLE | 1161 - PIN_NONBLOCK /* NOWARN */ | 1162 - PIN_NOEVICT); 1163 - if (IS_ERR(vma)) { 1164 - memset(&cache->node, 0, sizeof(cache->node)); 1165 - mutex_lock(&ggtt->vm.mutex); 1166 - err = drm_mm_insert_node_in_range 1167 - (&ggtt->vm.mm, &cache->node, 1168 - PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE, 1169 - 0, ggtt->mappable_end, 1170 - DRM_MM_INSERT_LOW); 1171 - mutex_unlock(&ggtt->vm.mutex); 1172 - if (err) /* no inactive aperture space, use cpu reloc */ 1173 - return NULL; 1174 - } else { 1175 - cache->node.start = vma->node.start; 1176 - cache->node.mm = (void *)vma; 1177 - } 1178 - } 1179 - 1180 - offset = cache->node.start; 1181 - if (drm_mm_node_allocated(&cache->node)) { 1182 - ggtt->vm.insert_page(&ggtt->vm, 1183 - i915_gem_object_get_dma_address(obj, page), 1184 - offset, I915_CACHE_NONE, 0); 1185 - } else { 1186 - offset += page << PAGE_SHIFT; 1187 - } 1188 - 1189 - vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap, 1190 - offset); 1191 - cache->page = page; 1192 - cache->vaddr = (unsigned long)vaddr; 1193 - 1194 - return vaddr; 1195 - } 1196 - 1197 - static 
void *reloc_vaddr(struct drm_i915_gem_object *obj, 1198 - struct reloc_cache *cache, 1199 - unsigned long page) 1200 - { 1201 - void *vaddr; 1202 - 1203 - if (cache->page == page) { 1204 - vaddr = unmask_page(cache->vaddr); 1205 - } else { 1206 - vaddr = NULL; 1207 - if ((cache->vaddr & KMAP) == 0) 1208 - vaddr = reloc_iomap(obj, cache, page); 1209 - if (!vaddr) 1210 - vaddr = reloc_kmap(obj, cache, page); 1211 - } 1212 - 1213 - return vaddr; 1214 - } 1215 - 1216 - static void clflush_write32(u32 *addr, u32 value, unsigned int flushes) 1217 - { 1218 - if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) { 1219 - if (flushes & CLFLUSH_BEFORE) { 1220 - clflushopt(addr); 1221 - mb(); 1222 - } 1223 - 1224 - *addr = value; 1225 - 1226 - /* 1227 - * Writes to the same cacheline are serialised by the CPU 1228 - * (including clflush). On the write path, we only require 1229 - * that it hits memory in an orderly fashion and place 1230 - * mb barriers at the start and end of the relocation phase 1231 - * to ensure ordering of clflush wrt to the system. 
1232 - */ 1233 - if (flushes & CLFLUSH_AFTER) 1234 - clflushopt(addr); 1235 - } else 1236 - *addr = value; 1237 1090 } 1238 1091 1239 1092 static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma) ··· 1226 1429 return cmd; 1227 1430 } 1228 1431 1229 - static inline bool use_reloc_gpu(struct i915_vma *vma) 1230 - { 1231 - if (DBG_FORCE_RELOC == FORCE_GPU_RELOC) 1232 - return true; 1233 - 1234 - if (DBG_FORCE_RELOC) 1235 - return false; 1236 - 1237 - return !dma_resv_test_signaled_rcu(vma->resv, true); 1238 - } 1239 - 1240 1432 static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset) 1241 1433 { 1242 1434 struct page *page; ··· 1240 1454 return addr + offset_in_page(offset); 1241 1455 } 1242 1456 1243 - static bool __reloc_entry_gpu(struct i915_execbuffer *eb, 1244 - struct i915_vma *vma, 1245 - u64 offset, 1246 - u64 target_addr) 1457 + static int __reloc_entry_gpu(struct i915_execbuffer *eb, 1458 + struct i915_vma *vma, 1459 + u64 offset, 1460 + u64 target_addr) 1247 1461 { 1248 1462 const unsigned int gen = eb->reloc_cache.gen; 1249 1463 unsigned int len; ··· 1259 1473 1260 1474 batch = reloc_gpu(eb, vma, len); 1261 1475 if (IS_ERR(batch)) 1262 - return false; 1476 + return PTR_ERR(batch); 1263 1477 1264 1478 addr = gen8_canonical_addr(vma->node.start + offset); 1265 1479 if (gen >= 8) { ··· 1308 1522 *batch++ = target_addr; 1309 1523 } 1310 1524 1311 - return true; 1312 - } 1313 - 1314 - static bool reloc_entry_gpu(struct i915_execbuffer *eb, 1315 - struct i915_vma *vma, 1316 - u64 offset, 1317 - u64 target_addr) 1318 - { 1319 - if (eb->reloc_cache.vaddr) 1320 - return false; 1321 - 1322 - if (!use_reloc_gpu(vma)) 1323 - return false; 1324 - 1325 - return __reloc_entry_gpu(eb, vma, offset, target_addr); 1525 + return 0; 1326 1526 } 1327 1527 1328 1528 static u64 1329 - relocate_entry(struct i915_vma *vma, 1529 + relocate_entry(struct i915_execbuffer *eb, 1530 + struct i915_vma *vma, 1330 1531 const struct 
drm_i915_gem_relocation_entry *reloc, 1331 - struct i915_execbuffer *eb, 1332 1532 const struct i915_vma *target) 1333 1533 { 1334 1534 u64 target_addr = relocation_target(reloc, target); 1335 - u64 offset = reloc->offset; 1535 + int err; 1336 1536 1337 - if (!reloc_entry_gpu(eb, vma, offset, target_addr)) { 1338 - bool wide = eb->reloc_cache.use_64bit_reloc; 1339 - void *vaddr; 1340 - 1341 - repeat: 1342 - vaddr = reloc_vaddr(vma->obj, 1343 - &eb->reloc_cache, 1344 - offset >> PAGE_SHIFT); 1345 - if (IS_ERR(vaddr)) 1346 - return PTR_ERR(vaddr); 1347 - 1348 - GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32))); 1349 - clflush_write32(vaddr + offset_in_page(offset), 1350 - lower_32_bits(target_addr), 1351 - eb->reloc_cache.vaddr); 1352 - 1353 - if (wide) { 1354 - offset += sizeof(u32); 1355 - target_addr >>= 32; 1356 - wide = false; 1357 - goto repeat; 1358 - } 1359 - } 1537 + err = __reloc_entry_gpu(eb, vma, reloc->offset, target_addr); 1538 + if (err) 1539 + return err; 1360 1540 1361 1541 return target->node.start | UPDATE; 1362 1542 } ··· 1387 1635 * If the relocation already has the right value in it, no 1388 1636 * more work needs to be done. 1389 1637 */ 1390 - if (!DBG_FORCE_RELOC && 1391 - gen8_canonical_addr(target->vma->node.start) == reloc->presumed_offset) 1638 + if (gen8_canonical_addr(target->vma->node.start) == reloc->presumed_offset) 1392 1639 return 0; 1393 1640 1394 1641 /* Check that the relocation address is valid... */ ··· 1419 1668 ev->flags &= ~EXEC_OBJECT_ASYNC; 1420 1669 1421 1670 /* and update the user's relocation entry */ 1422 - return relocate_entry(ev->vma, reloc, eb, target->vma); 1671 + return relocate_entry(eb, ev->vma, reloc, target->vma); 1423 1672 } 1424 1673 1425 1674 static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev) ··· 1457 1706 * this is bad and so lockdep complains vehemently. 
1458 1707 */ 1459 1708 copied = __copy_from_user(r, urelocs, count * sizeof(r[0])); 1460 - if (unlikely(copied)) { 1461 - remain = -EFAULT; 1462 - goto out; 1463 - } 1709 + if (unlikely(copied)) 1710 + return -EFAULT; 1464 1711 1465 1712 remain -= count; 1466 1713 do { ··· 1466 1717 1467 1718 if (likely(offset == 0)) { 1468 1719 } else if ((s64)offset < 0) { 1469 - remain = (int)offset; 1470 - goto out; 1720 + return (int)offset; 1471 1721 } else { 1472 1722 /* 1473 1723 * Note that reporting an error now ··· 1496 1748 } while (r++, --count); 1497 1749 urelocs += ARRAY_SIZE(stack); 1498 1750 } while (remain); 1499 - out: 1500 - reloc_cache_reset(&eb->reloc_cache); 1501 - return remain; 1751 + 1752 + return 0; 1502 1753 } 1503 1754 1504 1755 static int eb_relocate(struct i915_execbuffer *eb) ··· 2405 2658 eb.i915 = i915; 2406 2659 eb.file = file; 2407 2660 eb.args = args; 2408 - if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) 2661 + if (!(args->flags & I915_EXEC_NO_RELOC)) 2409 2662 args->flags |= __EXEC_HAS_RELOC; 2410 2663 2411 2664 eb.exec = exec;
+6 -15
drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
··· 37 37 return err; 38 38 39 39 /* 8-Byte aligned */ 40 - if (!__reloc_entry_gpu(eb, vma, 41 - offsets[0] * sizeof(u32), 42 - 0)) { 43 - err = -EIO; 40 + err = __reloc_entry_gpu(eb, vma, offsets[0] * sizeof(u32), 0); 41 + if (err) 44 42 goto unpin_vma; 45 - } 46 43 47 44 /* !8-Byte aligned */ 48 - if (!__reloc_entry_gpu(eb, vma, 49 - offsets[1] * sizeof(u32), 50 - 1)) { 51 - err = -EIO; 45 + err = __reloc_entry_gpu(eb, vma, offsets[1] * sizeof(u32), 1); 46 + if (err) 52 47 goto unpin_vma; 53 - } 54 48 55 49 /* Skip to the end of the cmd page */ 56 50 i = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1; ··· 54 60 eb->reloc_cache.rq_size += i; 55 61 56 62 /* Force batch chaining */ 57 - if (!__reloc_entry_gpu(eb, vma, 58 - offsets[2] * sizeof(u32), 59 - 2)) { 60 - err = -EIO; 63 + err = __reloc_entry_gpu(eb, vma, offsets[2] * sizeof(u32), 2); 64 + if (err) 61 65 goto unpin_vma; 62 - } 63 66 64 67 GEM_BUG_ON(!eb->reloc_cache.rq); 65 68 rq = i915_request_get(eb->reloc_cache.rq);