Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/i915: Move cmd parser pinning to execbuffer

We need to get rid of allocations in the cmd parser because it needs
to be called from a signaling context. First, move all pinning to
execbuf, where we already hold all locks.

Allocate jump_whitelist in the execbuffer, and add annotations around
intel_engine_cmd_parser(), to ensure we only call the command parser
without allocating any memory, or taking any locks we're not supposed to.

Because i915_gem_object_get_page() may also allocate memory, add a
path to i915_gem_object_get_sg() that prevents memory allocations,
and walk the sg list manually. It should be similarly fast.

This has the added benefit of being able to catch all memory allocation
errors before the point of no return, and return -ENOMEM safely to the
execbuf submitter.

Signed-off-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Acked-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210323155059.628690-4-maarten.lankhorst@linux.intel.com

authored by

Maarten Lankhorst and committed by
Daniel Vetter
0edbb9ba 2c8ab333

+140 -78
+68 -6
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
··· 28 28 #include "i915_sw_fence_work.h" 29 29 #include "i915_trace.h" 30 30 #include "i915_user_extensions.h" 31 + #include "i915_memcpy.h" 31 32 32 33 struct eb_vma { 33 34 struct i915_vma *vma; ··· 2282 2281 struct i915_vma *trampoline; 2283 2282 unsigned long batch_offset; 2284 2283 unsigned long batch_length; 2284 + unsigned long *jump_whitelist; 2285 + const void *batch_map; 2286 + void *shadow_map; 2285 2287 }; 2286 2288 2287 2289 static int __eb_parse(struct dma_fence_work *work) 2288 2290 { 2289 2291 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 2292 + int ret; 2293 + bool cookie; 2290 2294 2291 - return intel_engine_cmd_parser(pw->engine, 2292 - pw->batch, 2293 - pw->batch_offset, 2294 - pw->batch_length, 2295 - pw->shadow, 2296 - pw->trampoline); 2295 + cookie = dma_fence_begin_signalling(); 2296 + ret = intel_engine_cmd_parser(pw->engine, 2297 + pw->batch, 2298 + pw->batch_offset, 2299 + pw->batch_length, 2300 + pw->shadow, 2301 + pw->jump_whitelist, 2302 + pw->shadow_map, 2303 + pw->batch_map); 2304 + dma_fence_end_signalling(cookie); 2305 + 2306 + return ret; 2297 2307 } 2298 2308 2299 2309 static void __eb_parse_release(struct dma_fence_work *work) 2300 2310 { 2301 2311 struct eb_parse_work *pw = container_of(work, typeof(*pw), base); 2312 + 2313 + if (!IS_ERR_OR_NULL(pw->jump_whitelist)) 2314 + kfree(pw->jump_whitelist); 2315 + 2316 + if (pw->batch_map) 2317 + i915_gem_object_unpin_map(pw->batch->obj); 2318 + else 2319 + i915_gem_object_unpin_pages(pw->batch->obj); 2320 + 2321 + i915_gem_object_unpin_map(pw->shadow->obj); 2302 2322 2303 2323 if (pw->trampoline) 2304 2324 i915_active_release(&pw->trampoline->active); ··· 2370 2348 struct i915_vma *trampoline) 2371 2349 { 2372 2350 struct eb_parse_work *pw; 2351 + struct drm_i915_gem_object *batch = eb->batch->vma->obj; 2352 + bool needs_clflush; 2373 2353 int err; 2374 2354 2375 2355 GEM_BUG_ON(overflows_type(eb->batch_start_offset, pw->batch_offset)); ··· 2393 2369 err = 
i915_active_acquire(&trampoline->active); 2394 2370 if (err) 2395 2371 goto err_shadow; 2372 + } 2373 + 2374 + pw->shadow_map = i915_gem_object_pin_map(shadow->obj, I915_MAP_WB); 2375 + if (IS_ERR(pw->shadow_map)) { 2376 + err = PTR_ERR(pw->shadow_map); 2377 + goto err_trampoline; 2378 + } 2379 + 2380 + needs_clflush = 2381 + !(batch->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ); 2382 + 2383 + pw->batch_map = ERR_PTR(-ENODEV); 2384 + if (needs_clflush && i915_has_memcpy_from_wc()) 2385 + pw->batch_map = i915_gem_object_pin_map(batch, I915_MAP_WC); 2386 + 2387 + if (IS_ERR(pw->batch_map)) { 2388 + err = i915_gem_object_pin_pages(batch); 2389 + if (err) 2390 + goto err_unmap_shadow; 2391 + pw->batch_map = NULL; 2392 + } 2393 + 2394 + pw->jump_whitelist = 2395 + intel_engine_cmd_parser_alloc_jump_whitelist(eb->batch_len, 2396 + trampoline); 2397 + if (IS_ERR(pw->jump_whitelist)) { 2398 + err = PTR_ERR(pw->jump_whitelist); 2399 + goto err_unmap_batch; 2396 2400 } 2397 2401 2398 2402 dma_fence_work_init(&pw->base, &eb_parse_ops); ··· 2462 2410 dma_fence_work_commit_imm(&pw->base); 2463 2411 return err; 2464 2412 2413 + err_unmap_batch: 2414 + if (pw->batch_map) 2415 + i915_gem_object_unpin_map(batch); 2416 + else 2417 + i915_gem_object_unpin_pages(batch); 2418 + err_unmap_shadow: 2419 + i915_gem_object_unpin_map(shadow->obj); 2420 + err_trampoline: 2421 + if (trampoline) 2422 + i915_active_release(&trampoline->active); 2465 2423 err_shadow: 2466 2424 i915_active_release(&shadow->active); 2467 2425 err_batch:
+5 -5
drivers/gpu/drm/i915/gem/i915_gem_object.h
··· 299 299 __i915_gem_object_get_sg(struct drm_i915_gem_object *obj, 300 300 struct i915_gem_object_page_iter *iter, 301 301 unsigned int n, 302 - unsigned int *offset); 302 + unsigned int *offset, bool allow_alloc); 303 303 304 304 static inline struct scatterlist * 305 305 i915_gem_object_get_sg(struct drm_i915_gem_object *obj, 306 306 unsigned int n, 307 - unsigned int *offset) 307 + unsigned int *offset, bool allow_alloc) 308 308 { 309 - return __i915_gem_object_get_sg(obj, &obj->mm.get_page, n, offset); 309 + return __i915_gem_object_get_sg(obj, &obj->mm.get_page, n, offset, allow_alloc); 310 310 } 311 311 312 312 static inline struct scatterlist * 313 313 i915_gem_object_get_sg_dma(struct drm_i915_gem_object *obj, 314 314 unsigned int n, 315 - unsigned int *offset) 315 + unsigned int *offset, bool allow_alloc) 316 316 { 317 - return __i915_gem_object_get_sg(obj, &obj->mm.get_dma_page, n, offset); 317 + return __i915_gem_object_get_sg(obj, &obj->mm.get_dma_page, n, offset, allow_alloc); 318 318 } 319 319 320 320 struct page *
+17 -4
drivers/gpu/drm/i915/gem/i915_gem_pages.c
··· 448 448 __i915_gem_object_get_sg(struct drm_i915_gem_object *obj, 449 449 struct i915_gem_object_page_iter *iter, 450 450 unsigned int n, 451 - unsigned int *offset) 451 + unsigned int *offset, 452 + bool allow_alloc) 452 453 { 453 454 const bool dma = iter == &obj->mm.get_dma_page; 454 455 struct scatterlist *sg; ··· 470 469 */ 471 470 if (n < READ_ONCE(iter->sg_idx)) 472 471 goto lookup; 472 + 473 + if (!allow_alloc) 474 + goto manual_lookup; 473 475 474 476 mutex_lock(&iter->lock); 475 477 ··· 523 519 if (unlikely(n < idx)) /* insertion completed by another thread */ 524 520 goto lookup; 525 521 526 - /* In case we failed to insert the entry into the radixtree, we need 522 + goto manual_walk; 523 + 524 + manual_lookup: 525 + idx = 0; 526 + sg = obj->mm.pages->sgl; 527 + count = __sg_page_count(sg); 528 + 529 + manual_walk: 530 + /* 531 + * In case we failed to insert the entry into the radixtree, we need 527 532 * to look beyond the current sg. 528 533 */ 529 534 while (idx + count <= n) { ··· 579 566 580 567 GEM_BUG_ON(!i915_gem_object_has_struct_page(obj)); 581 568 582 - sg = i915_gem_object_get_sg(obj, n, &offset); 569 + sg = i915_gem_object_get_sg(obj, n, &offset, true); 583 570 return nth_page(sg_page(sg), offset); 584 571 } 585 572 ··· 605 592 struct scatterlist *sg; 606 593 unsigned int offset; 607 594 608 - sg = i915_gem_object_get_sg_dma(obj, n, &offset); 595 + sg = i915_gem_object_get_sg_dma(obj, n, &offset, true); 609 596 610 597 if (len) 611 598 *len = sg_dma_len(sg) - (offset << PAGE_SHIFT);
+1 -1
drivers/gpu/drm/i915/gt/intel_ggtt.c
··· 1420 1420 if (ret) 1421 1421 goto err_sg_alloc; 1422 1422 1423 - iter = i915_gem_object_get_sg_dma(obj, view->partial.offset, &offset); 1423 + iter = i915_gem_object_get_sg_dma(obj, view->partial.offset, &offset, true); 1424 1424 GEM_BUG_ON(!iter); 1425 1425 1426 1426 sg = st->sgl;
+41 -59
drivers/gpu/drm/i915/i915_cmd_parser.c
··· 1144 1144 /* Returns a vmap'd pointer to dst_obj, which the caller must unmap */ 1145 1145 static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, 1146 1146 struct drm_i915_gem_object *src_obj, 1147 - unsigned long offset, unsigned long length) 1147 + unsigned long offset, unsigned long length, 1148 + void *dst, const void *src) 1148 1149 { 1149 - bool needs_clflush; 1150 - void *dst, *src; 1151 - int ret; 1152 - 1153 - dst = i915_gem_object_pin_map(dst_obj, I915_MAP_WB); 1154 - if (IS_ERR(dst)) 1155 - return dst; 1156 - 1157 - ret = i915_gem_object_pin_pages(src_obj); 1158 - if (ret) { 1159 - i915_gem_object_unpin_map(dst_obj); 1160 - return ERR_PTR(ret); 1161 - } 1162 - 1163 - needs_clflush = 1150 + bool needs_clflush = 1164 1151 !(src_obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ); 1165 1152 1166 - src = ERR_PTR(-ENODEV); 1167 - if (needs_clflush && i915_has_memcpy_from_wc()) { 1168 - src = i915_gem_object_pin_map(src_obj, I915_MAP_WC); 1169 - if (!IS_ERR(src)) { 1170 - i915_unaligned_memcpy_from_wc(dst, 1171 - src + offset, 1172 - length); 1173 - i915_gem_object_unpin_map(src_obj); 1174 - } 1175 - } 1176 - if (IS_ERR(src)) { 1177 - unsigned long x, n, remain; 1153 + if (src) { 1154 + GEM_BUG_ON(!needs_clflush); 1155 + i915_unaligned_memcpy_from_wc(dst, src + offset, length); 1156 + } else { 1157 + struct scatterlist *sg; 1178 1158 void *ptr; 1159 + unsigned int x, sg_ofs; 1160 + unsigned long remain; 1179 1161 1180 1162 /* 1181 1163 * We can avoid clflushing partial cachelines before the write ··· 1174 1192 1175 1193 ptr = dst; 1176 1194 x = offset_in_page(offset); 1177 - for (n = offset >> PAGE_SHIFT; remain; n++) { 1178 - int len = min(remain, PAGE_SIZE - x); 1195 + sg = i915_gem_object_get_sg(src_obj, offset >> PAGE_SHIFT, &sg_ofs, false); 1179 1196 1180 - src = kmap_atomic(i915_gem_object_get_page(src_obj, n)); 1181 - if (needs_clflush) 1182 - drm_clflush_virt_range(src + x, len); 1183 - memcpy(ptr, src + x, len); 1184 - kunmap_atomic(src); 
1197 + while (remain) { 1198 + unsigned long sg_max = sg->length >> PAGE_SHIFT; 1185 1199 1186 - ptr += len; 1187 - remain -= len; 1188 - x = 0; 1200 + for (; remain && sg_ofs < sg_max; sg_ofs++) { 1201 + unsigned long len = min(remain, PAGE_SIZE - x); 1202 + void *map; 1203 + 1204 + map = kmap_atomic(nth_page(sg_page(sg), sg_ofs)); 1205 + if (needs_clflush) 1206 + drm_clflush_virt_range(map + x, len); 1207 + memcpy(ptr, map + x, len); 1208 + kunmap_atomic(map); 1209 + 1210 + ptr += len; 1211 + remain -= len; 1212 + x = 0; 1213 + } 1214 + 1215 + sg_ofs = 0; 1216 + sg = sg_next(sg); 1189 1217 } 1190 1218 } 1191 - 1192 - i915_gem_object_unpin_pages(src_obj); 1193 1219 1194 1220 memset32(dst + length, 0, (dst_obj->base.size - length) / sizeof(u32)); 1195 1221 ··· 1360 1370 if (target_cmd_index == offset) 1361 1371 return 0; 1362 1372 1363 - if (IS_ERR(jump_whitelist)) 1364 - return PTR_ERR(jump_whitelist); 1365 - 1366 1373 if (!test_bit(target_cmd_index, jump_whitelist)) { 1367 1374 DRM_DEBUG("CMD: BB_START to 0x%llx not a previously executed cmd\n", 1368 1375 jump_target); ··· 1369 1382 return 0; 1370 1383 } 1371 1384 1372 - static unsigned long *alloc_whitelist(u32 batch_length) 1385 + unsigned long *intel_engine_cmd_parser_alloc_jump_whitelist(u32 batch_length, 1386 + bool trampoline) 1373 1387 { 1374 1388 unsigned long *jmp; 1389 + 1390 + if (trampoline) 1391 + return NULL; 1375 1392 1376 1393 /* 1377 1394 * We expect batch_length to be less than 256KiB for known users, ··· 1414 1423 unsigned long batch_offset, 1415 1424 unsigned long batch_length, 1416 1425 struct i915_vma *shadow, 1417 - bool trampoline) 1426 + unsigned long *jump_whitelist, 1427 + void *shadow_map, 1428 + const void *batch_map) 1418 1429 { 1419 1430 u32 *cmd, *batch_end, offset = 0; 1420 1431 struct drm_i915_cmd_descriptor default_desc = noop_desc; 1421 1432 const struct drm_i915_cmd_descriptor *desc = &default_desc; 1422 - unsigned long *jump_whitelist; 1423 1433 u64 batch_addr, shadow_addr; 
1424 1434 int ret = 0; 1435 + bool trampoline = !jump_whitelist; 1425 1436 1426 1437 GEM_BUG_ON(!IS_ALIGNED(batch_offset, sizeof(*cmd))); 1427 1438 GEM_BUG_ON(!IS_ALIGNED(batch_length, sizeof(*cmd))); ··· 1431 1438 batch->size)); 1432 1439 GEM_BUG_ON(!batch_length); 1433 1440 1434 - cmd = copy_batch(shadow->obj, batch->obj, batch_offset, batch_length); 1435 - if (IS_ERR(cmd)) { 1436 - DRM_DEBUG("CMD: Failed to copy batch\n"); 1437 - return PTR_ERR(cmd); 1438 - } 1439 - 1440 - jump_whitelist = NULL; 1441 - if (!trampoline) 1442 - /* Defer failure until attempted use */ 1443 - jump_whitelist = alloc_whitelist(batch_length); 1441 + cmd = copy_batch(shadow->obj, batch->obj, batch_offset, batch_length, 1442 + shadow_map, batch_map); 1444 1443 1445 1444 shadow_addr = gen8_canonical_addr(shadow->node.start); 1446 1445 batch_addr = gen8_canonical_addr(batch->node.start + batch_offset); ··· 1533 1548 1534 1549 i915_gem_object_flush_map(shadow->obj); 1535 1550 1536 - if (!IS_ERR_OR_NULL(jump_whitelist)) 1537 - kfree(jump_whitelist); 1538 - i915_gem_object_unpin_map(shadow->obj); 1539 1551 return ret; 1540 1552 } 1541 1553
+6 -1
drivers/gpu/drm/i915/i915_drv.h
··· 1946 1946 int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv); 1947 1947 int intel_engine_init_cmd_parser(struct intel_engine_cs *engine); 1948 1948 void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine); 1949 + unsigned long *intel_engine_cmd_parser_alloc_jump_whitelist(u32 batch_length, 1950 + bool trampoline); 1951 + 1949 1952 int intel_engine_cmd_parser(struct intel_engine_cs *engine, 1950 1953 struct i915_vma *batch, 1951 1954 unsigned long batch_offset, 1952 1955 unsigned long batch_length, 1953 1956 struct i915_vma *shadow, 1954 - bool trampoline); 1957 + unsigned long *jump_whitelist, 1958 + void *shadow_map, 1959 + const void *batch_map); 1955 1960 #define I915_CMD_PARSER_TRAMPOLINE_SIZE 8 1956 1961 1957 1962 /* intel_device_info.c */
+1 -1
drivers/gpu/drm/i915/i915_memcpy.c
··· 135 135 * accepts that its arguments may not be aligned, but are valid for the 136 136 * potential 16-byte read past the end. 137 137 */ 138 - void i915_unaligned_memcpy_from_wc(void *dst, void *src, unsigned long len) 138 + void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len) 139 139 { 140 140 unsigned long addr; 141 141
+1 -1
drivers/gpu/drm/i915/i915_memcpy.h
··· 13 13 void i915_memcpy_init_early(struct drm_i915_private *i915); 14 14 15 15 bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len); 16 - void i915_unaligned_memcpy_from_wc(void *dst, void *src, unsigned long len); 16 + void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len); 17 17 18 18 /* The movntdqa instructions used for memcpy-from-wc require 16-byte alignment, 19 19 * as well as SSE4.1 support. i915_memcpy_from_wc() will report if it cannot