Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'drm-intel-gt-next-2021-10-21' of git://anongit.freedesktop.org/drm/drm-intel into drm-next

UAPI Changes:

- Expose multi-LRC submission interface

Similar to the bonded submission interface but simplified.
Comes with GuC only implementation for now. See kerneldoc
for more details.

Userspace changes: https://github.com/intel/media-driver/pull/1252

- Expose logical engine instance to user

Needed by the multi-LRC submission interface for GuC

Userspace changes: https://github.com/intel/media-driver/pull/1252

Driver Changes:

- Fix blank screen booting crashes when CONFIG_CC_OPTIMIZE_FOR_SIZE=y (Hugh)
- Add support for multi-LRC submission in the GuC backend (Matt B)
- Add extra cache flushing before making pages userspace visible (Matt A, Thomas)
- Mark internal GPU object pages dirty so they will be flushed properly (Matt A)

- Move remaining debugfs interfaces i915_wedged/i915_forcewake_user into gt (Andi)
- Replace the unconditional clflushes with drm_clflush_virt_range() (Ville)
- Remove IS_ACTIVE macro completely (Lucas)
- Improve kerneldocs for cache_dirty (Matt A)

- Add missing includes (Lucas)
- Selftest improvements (Matt R, Ran, Matt A)

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/YXFmLKoq8Fg9JxSd@jlahtine-mobl.ger.corp.intel.com

+3238 -771
-122
Documentation/gpu/rfc/i915_parallel_execbuf.h
··· 1 - /* SPDX-License-Identifier: MIT */ 2 - /* 3 - * Copyright © 2021 Intel Corporation 4 - */ 5 - 6 - #define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */ 7 - 8 - /** 9 - * struct drm_i915_context_engines_parallel_submit - Configure engine for 10 - * parallel submission. 11 - * 12 - * Setup a slot in the context engine map to allow multiple BBs to be submitted 13 - * in a single execbuf IOCTL. Those BBs will then be scheduled to run on the GPU 14 - * in parallel. Multiple hardware contexts are created internally in the i915 15 - * run these BBs. Once a slot is configured for N BBs only N BBs can be 16 - * submitted in each execbuf IOCTL and this is implicit behavior e.g. The user 17 - * doesn't tell the execbuf IOCTL there are N BBs, the execbuf IOCTL knows how 18 - * many BBs there are based on the slot's configuration. The N BBs are the last 19 - * N buffer objects or first N if I915_EXEC_BATCH_FIRST is set. 20 - * 21 - * The default placement behavior is to create implicit bonds between each 22 - * context if each context maps to more than 1 physical engine (e.g. context is 23 - * a virtual engine). Also we only allow contexts of same engine class and these 24 - * contexts must be in logically contiguous order. Examples of the placement 25 - * behavior described below. Lastly, the default is to not allow BBs to 26 - * preempted mid BB rather insert coordinated preemption on all hardware 27 - * contexts between each set of BBs. Flags may be added in the future to change 28 - * both of these default behaviors. 29 - * 30 - * Returns -EINVAL if hardware context placement configuration is invalid or if 31 - * the placement configuration isn't supported on the platform / submission 32 - * interface. 33 - * Returns -ENODEV if extension isn't supported on the platform / submission 34 - * interface. 35 - * 36 - * .. 
code-block:: none 37 - * 38 - * Example 1 pseudo code: 39 - * CS[X] = generic engine of same class, logical instance X 40 - * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE 41 - * set_engines(INVALID) 42 - * set_parallel(engine_index=0, width=2, num_siblings=1, 43 - * engines=CS[0],CS[1]) 44 - * 45 - * Results in the following valid placement: 46 - * CS[0], CS[1] 47 - * 48 - * Example 2 pseudo code: 49 - * CS[X] = generic engine of same class, logical instance X 50 - * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE 51 - * set_engines(INVALID) 52 - * set_parallel(engine_index=0, width=2, num_siblings=2, 53 - * engines=CS[0],CS[2],CS[1],CS[3]) 54 - * 55 - * Results in the following valid placements: 56 - * CS[0], CS[1] 57 - * CS[2], CS[3] 58 - * 59 - * This can also be thought of as 2 virtual engines described by 2-D array 60 - * in the engines the field with bonds placed between each index of the 61 - * virtual engines. e.g. CS[0] is bonded to CS[1], CS[2] is bonded to 62 - * CS[3]. 63 - * VE[0] = CS[0], CS[2] 64 - * VE[1] = CS[1], CS[3] 65 - * 66 - * Example 3 pseudo code: 67 - * CS[X] = generic engine of same class, logical instance X 68 - * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE 69 - * set_engines(INVALID) 70 - * set_parallel(engine_index=0, width=2, num_siblings=2, 71 - * engines=CS[0],CS[1],CS[1],CS[3]) 72 - * 73 - * Results in the following valid and invalid placements: 74 - * CS[0], CS[1] 75 - * CS[1], CS[3] - Not logical contiguous, return -EINVAL 76 - */ 77 - struct drm_i915_context_engines_parallel_submit { 78 - /** 79 - * @base: base user extension. 
80 - */ 81 - struct i915_user_extension base; 82 - 83 - /** 84 - * @engine_index: slot for parallel engine 85 - */ 86 - __u16 engine_index; 87 - 88 - /** 89 - * @width: number of contexts per parallel engine 90 - */ 91 - __u16 width; 92 - 93 - /** 94 - * @num_siblings: number of siblings per context 95 - */ 96 - __u16 num_siblings; 97 - 98 - /** 99 - * @mbz16: reserved for future use; must be zero 100 - */ 101 - __u16 mbz16; 102 - 103 - /** 104 - * @flags: all undefined flags must be zero, currently not defined flags 105 - */ 106 - __u64 flags; 107 - 108 - /** 109 - * @mbz64: reserved for future use; must be zero 110 - */ 111 - __u64 mbz64[3]; 112 - 113 - /** 114 - * @engines: 2-d array of engine instances to configure parallel engine 115 - * 116 - * length = width (i) * num_siblings (j) 117 - * index = j + i * num_siblings 118 - */ 119 - struct i915_engine_class_instance engines[0]; 120 - 121 - } __packed; 122 -
+2 -2
Documentation/gpu/rfc/i915_scheduler.rst
··· 135 135 drm_i915_context_engines_parallel_submit to the uAPI to implement this 136 136 extension. 137 137 138 - .. kernel-doc:: Documentation/gpu/rfc/i915_parallel_execbuf.h 139 - :functions: drm_i915_context_engines_parallel_submit 138 + .. kernel-doc:: include/uapi/drm/i915_drm.h 139 + :functions: i915_context_engines_parallel_submit 140 140 141 141 Extend execbuf2 IOCTL to support submitting N BBs in a single IOCTL 142 142 -------------------------------------------------------------------
+46 -13
drivers/gpu/drm/i915/gem/i915_gem_busy.c
··· 4 4 * Copyright © 2014-2016 Intel Corporation 5 5 */ 6 6 7 + #include <linux/dma-fence-array.h> 8 + 7 9 #include "gt/intel_engine.h" 8 10 9 11 #include "i915_gem_ioctls.h" ··· 38 36 } 39 37 40 38 static __always_inline unsigned int 41 - __busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id)) 39 + __busy_set_if_active(struct dma_fence *fence, u32 (*flag)(u16 id)) 42 40 { 43 41 const struct i915_request *rq; 44 42 ··· 48 46 * to eventually flush us, but to minimise latency just ask the 49 47 * hardware. 50 48 * 51 - * Note we only report on the status of native fences. 49 + * Note we only report on the status of native fences and we currently 50 + * have two native fences: 51 + * 52 + * 1. A composite fence (dma_fence_array) constructed of i915 requests 53 + * created during a parallel submission. In this case we deconstruct the 54 + * composite fence into individual i915 requests and check the status of 55 + * each request. 56 + * 57 + * 2. A single i915 request. 52 58 */ 53 - if (!dma_fence_is_i915(fence)) 54 - return 0; 59 + if (dma_fence_is_array(fence)) { 60 + struct dma_fence_array *array = to_dma_fence_array(fence); 61 + struct dma_fence **child = array->fences; 62 + unsigned int nchild = array->num_fences; 55 63 56 - /* opencode to_request() in order to avoid const warnings */ 57 - rq = container_of(fence, const struct i915_request, fence); 58 - if (i915_request_completed(rq)) 59 - return 0; 64 + do { 65 + struct dma_fence *current_fence = *child++; 60 66 61 - /* Beware type-expansion follies! 
*/ 62 - BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class)); 63 - return flag(rq->engine->uabi_class); 67 + /* Not an i915 fence, can't be busy per above */ 68 + if (!dma_fence_is_i915(current_fence) || 69 + !test_bit(I915_FENCE_FLAG_COMPOSITE, 70 + &current_fence->flags)) { 71 + return 0; 72 + } 73 + 74 + rq = to_request(current_fence); 75 + if (!i915_request_completed(rq)) 76 + return flag(rq->engine->uabi_class); 77 + } while (--nchild); 78 + 79 + /* All requests in array complete, not busy */ 80 + return 0; 81 + } else { 82 + if (!dma_fence_is_i915(fence)) 83 + return 0; 84 + 85 + rq = to_request(fence); 86 + if (i915_request_completed(rq)) 87 + return 0; 88 + 89 + /* Beware type-expansion follies! */ 90 + BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class)); 91 + return flag(rq->engine->uabi_class); 92 + } 64 93 } 65 94 66 95 static __always_inline unsigned int 67 - busy_check_reader(const struct dma_fence *fence) 96 + busy_check_reader(struct dma_fence *fence) 68 97 { 69 98 return __busy_set_if_active(fence, __busy_read_flag); 70 99 } 71 100 72 101 static __always_inline unsigned int 73 - busy_check_writer(const struct dma_fence *fence) 102 + busy_check_writer(struct dma_fence *fence) 74 103 { 75 104 if (!fence) 76 105 return 0;
+225 -2
drivers/gpu/drm/i915/gem/i915_gem_context.c
··· 556 556 return 0; 557 557 } 558 558 559 + static int 560 + set_proto_ctx_engines_parallel_submit(struct i915_user_extension __user *base, 561 + void *data) 562 + { 563 + struct i915_context_engines_parallel_submit __user *ext = 564 + container_of_user(base, typeof(*ext), base); 565 + const struct set_proto_ctx_engines *set = data; 566 + struct drm_i915_private *i915 = set->i915; 567 + u64 flags; 568 + int err = 0, n, i, j; 569 + u16 slot, width, num_siblings; 570 + struct intel_engine_cs **siblings = NULL; 571 + intel_engine_mask_t prev_mask; 572 + 573 + /* FIXME: This is NIY for execlists */ 574 + if (!(intel_uc_uses_guc_submission(&i915->gt.uc))) 575 + return -ENODEV; 576 + 577 + if (get_user(slot, &ext->engine_index)) 578 + return -EFAULT; 579 + 580 + if (get_user(width, &ext->width)) 581 + return -EFAULT; 582 + 583 + if (get_user(num_siblings, &ext->num_siblings)) 584 + return -EFAULT; 585 + 586 + if (slot >= set->num_engines) { 587 + drm_dbg(&i915->drm, "Invalid placement value, %d >= %d\n", 588 + slot, set->num_engines); 589 + return -EINVAL; 590 + } 591 + 592 + if (set->engines[slot].type != I915_GEM_ENGINE_TYPE_INVALID) { 593 + drm_dbg(&i915->drm, 594 + "Invalid placement[%d], already occupied\n", slot); 595 + return -EINVAL; 596 + } 597 + 598 + if (get_user(flags, &ext->flags)) 599 + return -EFAULT; 600 + 601 + if (flags) { 602 + drm_dbg(&i915->drm, "Unknown flags 0x%02llx", flags); 603 + return -EINVAL; 604 + } 605 + 606 + for (n = 0; n < ARRAY_SIZE(ext->mbz64); n++) { 607 + err = check_user_mbz(&ext->mbz64[n]); 608 + if (err) 609 + return err; 610 + } 611 + 612 + if (width < 2) { 613 + drm_dbg(&i915->drm, "Width (%d) < 2\n", width); 614 + return -EINVAL; 615 + } 616 + 617 + if (num_siblings < 1) { 618 + drm_dbg(&i915->drm, "Number siblings (%d) < 1\n", 619 + num_siblings); 620 + return -EINVAL; 621 + } 622 + 623 + siblings = kmalloc_array(num_siblings * width, 624 + sizeof(*siblings), 625 + GFP_KERNEL); 626 + if (!siblings) 627 + return -ENOMEM; 628 
+ 629 + /* Create contexts / engines */ 630 + for (i = 0; i < width; ++i) { 631 + intel_engine_mask_t current_mask = 0; 632 + struct i915_engine_class_instance prev_engine; 633 + 634 + for (j = 0; j < num_siblings; ++j) { 635 + struct i915_engine_class_instance ci; 636 + 637 + n = i * num_siblings + j; 638 + if (copy_from_user(&ci, &ext->engines[n], sizeof(ci))) { 639 + err = -EFAULT; 640 + goto out_err; 641 + } 642 + 643 + siblings[n] = 644 + intel_engine_lookup_user(i915, ci.engine_class, 645 + ci.engine_instance); 646 + if (!siblings[n]) { 647 + drm_dbg(&i915->drm, 648 + "Invalid sibling[%d]: { class:%d, inst:%d }\n", 649 + n, ci.engine_class, ci.engine_instance); 650 + err = -EINVAL; 651 + goto out_err; 652 + } 653 + 654 + if (n) { 655 + if (prev_engine.engine_class != 656 + ci.engine_class) { 657 + drm_dbg(&i915->drm, 658 + "Mismatched class %d, %d\n", 659 + prev_engine.engine_class, 660 + ci.engine_class); 661 + err = -EINVAL; 662 + goto out_err; 663 + } 664 + } 665 + 666 + prev_engine = ci; 667 + current_mask |= siblings[n]->logical_mask; 668 + } 669 + 670 + if (i > 0) { 671 + if (current_mask != prev_mask << 1) { 672 + drm_dbg(&i915->drm, 673 + "Non contiguous logical mask 0x%x, 0x%x\n", 674 + prev_mask, current_mask); 675 + err = -EINVAL; 676 + goto out_err; 677 + } 678 + } 679 + prev_mask = current_mask; 680 + } 681 + 682 + set->engines[slot].type = I915_GEM_ENGINE_TYPE_PARALLEL; 683 + set->engines[slot].num_siblings = num_siblings; 684 + set->engines[slot].width = width; 685 + set->engines[slot].siblings = siblings; 686 + 687 + return 0; 688 + 689 + out_err: 690 + kfree(siblings); 691 + 692 + return err; 693 + } 694 + 559 695 static const i915_user_extension_fn set_proto_ctx_engines_extensions[] = { 560 696 [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_proto_ctx_engines_balance, 561 697 [I915_CONTEXT_ENGINES_EXT_BOND] = set_proto_ctx_engines_bond, 698 + [I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT] = 699 + set_proto_ctx_engines_parallel_submit, 562 700 }; 
563 701 564 702 static int set_proto_ctx_engines(struct drm_i915_file_private *fpriv, ··· 932 794 GEM_BUG_ON(rcu_access_pointer(ce->gem_context)); 933 795 RCU_INIT_POINTER(ce->gem_context, ctx); 934 796 797 + GEM_BUG_ON(intel_context_is_pinned(ce)); 935 798 ce->ring_size = SZ_16K; 936 799 937 800 i915_vm_put(ce->vm); ··· 955 816 ret = intel_context_reconfigure_sseu(ce, sseu); 956 817 957 818 return ret; 819 + } 820 + 821 + static void __unpin_engines(struct i915_gem_engines *e, unsigned int count) 822 + { 823 + while (count--) { 824 + struct intel_context *ce = e->engines[count], *child; 825 + 826 + if (!ce || !test_bit(CONTEXT_PERMA_PIN, &ce->flags)) 827 + continue; 828 + 829 + for_each_child(ce, child) 830 + intel_context_unpin(child); 831 + intel_context_unpin(ce); 832 + } 833 + } 834 + 835 + static void unpin_engines(struct i915_gem_engines *e) 836 + { 837 + __unpin_engines(e, e->num_engines); 958 838 } 959 839 960 840 static void __free_engines(struct i915_gem_engines *e, unsigned int count) ··· 1091 933 return err; 1092 934 } 1093 935 936 + static int perma_pin_contexts(struct intel_context *ce) 937 + { 938 + struct intel_context *child; 939 + int i = 0, j = 0, ret; 940 + 941 + GEM_BUG_ON(!intel_context_is_parent(ce)); 942 + 943 + ret = intel_context_pin(ce); 944 + if (unlikely(ret)) 945 + return ret; 946 + 947 + for_each_child(ce, child) { 948 + ret = intel_context_pin(child); 949 + if (unlikely(ret)) 950 + goto unwind; 951 + ++i; 952 + } 953 + 954 + set_bit(CONTEXT_PERMA_PIN, &ce->flags); 955 + 956 + return 0; 957 + 958 + unwind: 959 + intel_context_unpin(ce); 960 + for_each_child(ce, child) { 961 + if (j++ < i) 962 + intel_context_unpin(child); 963 + else 964 + break; 965 + } 966 + 967 + return ret; 968 + } 969 + 1094 970 static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, 1095 971 unsigned int num_engines, 1096 972 struct i915_gem_proto_engine *pe) ··· 1138 946 e->num_engines = num_engines; 1139 947 1140 948 for (n = 0; n < 
num_engines; n++) { 1141 - struct intel_context *ce; 949 + struct intel_context *ce, *child; 1142 950 int ret; 1143 951 1144 952 switch (pe[n].type) { ··· 1148 956 1149 957 case I915_GEM_ENGINE_TYPE_BALANCED: 1150 958 ce = intel_engine_create_virtual(pe[n].siblings, 1151 - pe[n].num_siblings); 959 + pe[n].num_siblings, 0); 960 + break; 961 + 962 + case I915_GEM_ENGINE_TYPE_PARALLEL: 963 + ce = intel_engine_create_parallel(pe[n].siblings, 964 + pe[n].num_siblings, 965 + pe[n].width); 1152 966 break; 1153 967 1154 968 case I915_GEM_ENGINE_TYPE_INVALID: ··· 1174 976 if (ret) { 1175 977 err = ERR_PTR(ret); 1176 978 goto free_engines; 979 + } 980 + for_each_child(ce, child) { 981 + ret = intel_context_set_gem(child, ctx, pe->sseu); 982 + if (ret) { 983 + err = ERR_PTR(ret); 984 + goto free_engines; 985 + } 986 + } 987 + 988 + /* 989 + * XXX: Must be done after calling intel_context_set_gem as that 990 + * function changes the ring size. The ring is allocated when 991 + * the context is pinned. If the ring size is changed after 992 + * allocation we have a mismatch of the ring size and will cause 993 + * the context to hang. Presumably with a bit of reordering we 994 + * could move the perma-pin step to the backend function 995 + * intel_engine_create_parallel. 996 + */ 997 + if (pe[n].type == I915_GEM_ENGINE_TYPE_PARALLEL) { 998 + ret = perma_pin_contexts(ce); 999 + if (ret) { 1000 + err = ERR_PTR(ret); 1001 + goto free_engines; 1002 + } 1177 1003 } 1178 1004 } 1179 1005 ··· 1441 1219 1442 1220 /* Flush any concurrent set_engines() */ 1443 1221 mutex_lock(&ctx->engines_mutex); 1222 + unpin_engines(__context_engines_static(ctx)); 1444 1223 engines_idle_release(ctx, rcu_replace_pointer(ctx->engines, NULL, 1)); 1445 1224 i915_gem_context_set_closed(ctx); 1446 1225 mutex_unlock(&ctx->engines_mutex);
+13 -3
drivers/gpu/drm/i915/gem/i915_gem_context_types.h
··· 78 78 79 79 /** @I915_GEM_ENGINE_TYPE_BALANCED: A load-balanced engine set */ 80 80 I915_GEM_ENGINE_TYPE_BALANCED, 81 + 82 + /** @I915_GEM_ENGINE_TYPE_PARALLEL: A parallel engine set */ 83 + I915_GEM_ENGINE_TYPE_PARALLEL, 81 84 }; 82 85 83 86 /** 84 87 * struct i915_gem_proto_engine - prototype engine 85 88 * 86 89 * This struct describes an engine that a context may contain. Engines 87 - * have three types: 90 + * have four types: 88 91 * 89 92 * - I915_GEM_ENGINE_TYPE_INVALID: Invalid engines can be created but they 90 93 * show up as a NULL in i915_gem_engines::engines[i] and any attempt to ··· 100 97 * 101 98 * - I915_GEM_ENGINE_TYPE_BALANCED: A load-balanced engine set, described 102 99 * i915_gem_proto_engine::num_siblings and i915_gem_proto_engine::siblings. 100 + * 101 + * - I915_GEM_ENGINE_TYPE_PARALLEL: A parallel submission engine set, described 102 + * i915_gem_proto_engine::width, i915_gem_proto_engine::num_siblings, and 103 + * i915_gem_proto_engine::siblings. 103 104 */ 104 105 struct i915_gem_proto_engine { 105 106 /** @type: Type of this engine */ ··· 112 105 /** @engine: Engine, for physical */ 113 106 struct intel_engine_cs *engine; 114 107 115 - /** @num_siblings: Number of balanced siblings */ 108 + /** @num_siblings: Number of balanced or parallel siblings */ 116 109 unsigned int num_siblings; 117 110 118 - /** @siblings: Balanced siblings */ 111 + /** @width: Width of each sibling */ 112 + unsigned int width; 113 + 114 + /** @siblings: Balanced siblings or num_siblings * width for parallel */ 119 115 struct intel_engine_cs **siblings; 120 116 121 117 /** @sseu: Client-set SSEU parameters */
+7 -2
drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
··· 232 232 233 233 static int i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj) 234 234 { 235 + struct drm_i915_private *i915 = to_i915(obj->base.dev); 235 236 struct sg_table *pages; 236 237 unsigned int sg_page_sizes; 237 238 ··· 243 242 if (IS_ERR(pages)) 244 243 return PTR_ERR(pages); 245 244 246 - sg_page_sizes = i915_sg_dma_sizes(pages->sgl); 245 + /* XXX: consider doing a vmap flush or something */ 246 + if (!HAS_LLC(i915) || i915_gem_object_can_bypass_llc(obj)) 247 + wbinvd_on_all_cpus(); 247 248 249 + sg_page_sizes = i915_sg_dma_sizes(pages->sgl); 248 250 __i915_gem_object_set_pages(obj, pages, sg_page_sizes); 249 251 250 252 return 0; ··· 305 301 } 306 302 307 303 drm_gem_private_object_init(dev, &obj->base, dma_buf->size); 308 - i915_gem_object_init(obj, &i915_gem_object_dmabuf_ops, &lock_class, 0); 304 + i915_gem_object_init(obj, &i915_gem_object_dmabuf_ops, &lock_class, 305 + I915_BO_ALLOC_USER); 309 306 obj->base.import_attach = attach; 310 307 obj->base.resv = dma_buf->resv; 311 308
+558 -239
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
··· 246 246 struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */ 247 247 struct eb_vma *vma; 248 248 249 - struct intel_engine_cs *engine; /** engine to queue the request to */ 249 + struct intel_gt *gt; /* gt for the execbuf */ 250 250 struct intel_context *context; /* logical state for the request */ 251 251 struct i915_gem_context *gem_context; /** caller's context */ 252 252 253 - struct i915_request *request; /** our request to build */ 254 - struct eb_vma *batch; /** identity of the batch obj/vma */ 253 + /** our requests to build */ 254 + struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; 255 + /** identity of the batch obj/vma */ 256 + struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; 255 257 struct i915_vma *trampoline; /** trampoline used for chaining */ 258 + 259 + /** used for excl fence in dma_resv objects when > 1 BB submitted */ 260 + struct dma_fence *composite_fence; 256 261 257 262 /** actual size of execobj[] as we may extend it for the cmdparser */ 258 263 unsigned int buffer_count; 264 + 265 + /* number of batches in execbuf IOCTL */ 266 + unsigned int num_batches; 259 267 260 268 /** list of vma not yet bound during reservation phase */ 261 269 struct list_head unbound; ··· 291 283 292 284 u64 invalid_flags; /** Set of execobj.flags that are invalid */ 293 285 294 - u64 batch_len; /** Length of batch within object */ 286 + /** Length of batch within object */ 287 + u64 batch_len[MAX_ENGINE_INSTANCE + 1]; 295 288 u32 batch_start_offset; /** Location within object of batch */ 296 289 u32 batch_flags; /** Flags composed for emit_bb_start() */ 297 290 struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */ ··· 310 301 }; 311 302 312 303 static int eb_parse(struct i915_execbuffer *eb); 313 - static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, 314 - bool throttle); 304 + static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle); 315 305 static void eb_unpin_engine(struct 
i915_execbuffer *eb); 316 306 317 307 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) 318 308 { 319 - return intel_engine_requires_cmd_parser(eb->engine) || 320 - (intel_engine_using_cmd_parser(eb->engine) && 309 + return intel_engine_requires_cmd_parser(eb->context->engine) || 310 + (intel_engine_using_cmd_parser(eb->context->engine) && 321 311 eb->args->batch_len); 322 312 } 323 313 ··· 543 535 return 0; 544 536 } 545 537 546 - static void 538 + static inline bool 539 + is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx) 540 + { 541 + return eb->args->flags & I915_EXEC_BATCH_FIRST ? 542 + buffer_idx < eb->num_batches : 543 + buffer_idx >= eb->args->buffer_count - eb->num_batches; 544 + } 545 + 546 + static int 547 547 eb_add_vma(struct i915_execbuffer *eb, 548 - unsigned int i, unsigned batch_idx, 548 + unsigned int *current_batch, 549 + unsigned int i, 549 550 struct i915_vma *vma) 550 551 { 552 + struct drm_i915_private *i915 = eb->i915; 551 553 struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; 552 554 struct eb_vma *ev = &eb->vma[i]; 553 555 ··· 584 566 * Note that actual hangs have only been observed on gen7, but for 585 567 * paranoia do it everywhere. 
586 568 */ 587 - if (i == batch_idx) { 569 + if (is_batch_buffer(eb, i)) { 588 570 if (entry->relocation_count && 589 571 !(ev->flags & EXEC_OBJECT_PINNED)) 590 572 ev->flags |= __EXEC_OBJECT_NEEDS_BIAS; 591 573 if (eb->reloc_cache.has_fence) 592 574 ev->flags |= EXEC_OBJECT_NEEDS_FENCE; 593 575 594 - eb->batch = ev; 576 + eb->batches[*current_batch] = ev; 577 + 578 + if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) { 579 + drm_dbg(&i915->drm, 580 + "Attempting to use self-modifying batch buffer\n"); 581 + return -EINVAL; 582 + } 583 + 584 + if (range_overflows_t(u64, 585 + eb->batch_start_offset, 586 + eb->args->batch_len, 587 + ev->vma->size)) { 588 + drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); 589 + return -EINVAL; 590 + } 591 + 592 + if (eb->args->batch_len == 0) 593 + eb->batch_len[*current_batch] = ev->vma->size - 594 + eb->batch_start_offset; 595 + else 596 + eb->batch_len[*current_batch] = eb->args->batch_len; 597 + if (unlikely(eb->batch_len[*current_batch] == 0)) { /* impossible! 
*/ 598 + drm_dbg(&i915->drm, "Invalid batch length\n"); 599 + return -EINVAL; 600 + } 601 + 602 + ++*current_batch; 595 603 } 604 + 605 + return 0; 596 606 } 597 607 598 608 static inline int use_cpu_reloc(const struct reloc_cache *cache, ··· 764 718 } while (1); 765 719 } 766 720 767 - static unsigned int eb_batch_index(const struct i915_execbuffer *eb) 768 - { 769 - if (eb->args->flags & I915_EXEC_BATCH_FIRST) 770 - return 0; 771 - else 772 - return eb->buffer_count - 1; 773 - } 774 - 775 721 static int eb_select_context(struct i915_execbuffer *eb) 776 722 { 777 723 struct i915_gem_context *ctx; ··· 884 846 885 847 static int eb_lookup_vmas(struct i915_execbuffer *eb) 886 848 { 887 - struct drm_i915_private *i915 = eb->i915; 888 - unsigned int batch = eb_batch_index(eb); 889 - unsigned int i; 849 + unsigned int i, current_batch = 0; 890 850 int err = 0; 891 851 892 852 INIT_LIST_HEAD(&eb->relocs); ··· 904 868 goto err; 905 869 } 906 870 907 - eb_add_vma(eb, i, batch, vma); 871 + err = eb_add_vma(eb, &current_batch, i, vma); 872 + if (err) 873 + return err; 908 874 909 875 if (i915_gem_object_is_userptr(vma->obj)) { 910 876 err = i915_gem_object_userptr_submit_init(vma->obj); ··· 927 889 eb->vma[i].flags |= __EXEC_OBJECT_USERPTR_INIT; 928 890 eb->args->flags |= __EXEC_USERPTR_USED; 929 891 } 930 - } 931 - 932 - if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) { 933 - drm_dbg(&i915->drm, 934 - "Attempting to use self-modifying batch buffer\n"); 935 - return -EINVAL; 936 - } 937 - 938 - if (range_overflows_t(u64, 939 - eb->batch_start_offset, eb->batch_len, 940 - eb->batch->vma->size)) { 941 - drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); 942 - return -EINVAL; 943 - } 944 - 945 - if (eb->batch_len == 0) 946 - eb->batch_len = eb->batch->vma->size - eb->batch_start_offset; 947 - if (unlikely(eb->batch_len == 0)) { /* impossible! 
*/ 948 - drm_dbg(&i915->drm, "Invalid batch length\n"); 949 - return -EINVAL; 950 892 } 951 893 952 894 return 0; ··· 1661 1643 return 0; 1662 1644 } 1663 1645 1664 - static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb, 1665 - struct i915_request *rq) 1646 + static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb) 1666 1647 { 1667 1648 bool have_copy = false; 1668 1649 struct eb_vma *ev; ··· 1676 1659 /* We may process another execbuffer during the unlock... */ 1677 1660 eb_release_vmas(eb, false); 1678 1661 i915_gem_ww_ctx_fini(&eb->ww); 1679 - 1680 - if (rq) { 1681 - /* nonblocking is always false */ 1682 - if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 1683 - MAX_SCHEDULE_TIMEOUT) < 0) { 1684 - i915_request_put(rq); 1685 - rq = NULL; 1686 - 1687 - err = -EINTR; 1688 - goto err_relock; 1689 - } 1690 - 1691 - i915_request_put(rq); 1692 - rq = NULL; 1693 - } 1694 1662 1695 1663 /* 1696 1664 * We take 3 passes through the slowpatch. ··· 1703 1701 if (!err) 1704 1702 err = eb_reinit_userptr(eb); 1705 1703 1706 - err_relock: 1707 1704 i915_gem_ww_ctx_init(&eb->ww, true); 1708 1705 if (err) 1709 1706 goto out; 1710 1707 1711 1708 /* reacquire the objects */ 1712 1709 repeat_validate: 1713 - rq = eb_pin_engine(eb, false); 1714 - if (IS_ERR(rq)) { 1715 - err = PTR_ERR(rq); 1716 - rq = NULL; 1710 + err = eb_pin_engine(eb, false); 1711 + if (err) 1717 1712 goto err; 1718 - } 1719 - 1720 - /* We didn't throttle, should be NULL */ 1721 - GEM_WARN_ON(rq); 1722 1713 1723 1714 err = eb_validate_vmas(eb); 1724 1715 if (err) 1725 1716 goto err; 1726 1717 1727 - GEM_BUG_ON(!eb->batch); 1718 + GEM_BUG_ON(!eb->batches[0]); 1728 1719 1729 1720 list_for_each_entry(ev, &eb->relocs, reloc_link) { 1730 1721 if (!have_copy) { ··· 1781 1786 } 1782 1787 } 1783 1788 1784 - if (rq) 1785 - i915_request_put(rq); 1786 - 1787 1789 return err; 1788 1790 } 1789 1791 1790 1792 static int eb_relocate_parse(struct i915_execbuffer *eb) 1791 1793 { 1792 1794 int err; 
1793 - struct i915_request *rq = NULL; 1794 1795 bool throttle = true; 1795 1796 1796 1797 retry: 1797 - rq = eb_pin_engine(eb, throttle); 1798 - if (IS_ERR(rq)) { 1799 - err = PTR_ERR(rq); 1800 - rq = NULL; 1798 + err = eb_pin_engine(eb, throttle); 1799 + if (err) { 1801 1800 if (err != -EDEADLK) 1802 1801 return err; 1803 1802 1804 1803 goto err; 1805 - } 1806 - 1807 - if (rq) { 1808 - bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; 1809 - 1810 - /* Need to drop all locks now for throttling, take slowpath */ 1811 - err = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 0); 1812 - if (err == -ETIME) { 1813 - if (nonblock) { 1814 - err = -EWOULDBLOCK; 1815 - i915_request_put(rq); 1816 - goto err; 1817 - } 1818 - goto slow; 1819 - } 1820 - i915_request_put(rq); 1821 - rq = NULL; 1822 1804 } 1823 1805 1824 1806 /* only throttle once, even if we didn't need to throttle */ ··· 1837 1865 return err; 1838 1866 1839 1867 slow: 1840 - err = eb_relocate_parse_slow(eb, rq); 1868 + err = eb_relocate_parse_slow(eb); 1841 1869 if (err) 1842 1870 /* 1843 1871 * If the user expects the execobject.offset and ··· 1851 1879 return err; 1852 1880 } 1853 1881 1882 + /* 1883 + * Using two helper loops for the order of which requests / batches are created 1884 + * and added the to backend. Requests are created in order from the parent to 1885 + * the last child. Requests are added in the reverse order, from the last child 1886 + * to parent. This is done for locking reasons as the timeline lock is acquired 1887 + * during request creation and released when the request is added to the 1888 + * backend. To make lockdep happy (see intel_context_timeline_lock) this must be 1889 + * the ordering. 
1890 + */ 1891 + #define for_each_batch_create_order(_eb, _i) \ 1892 + for ((_i) = 0; (_i) < (_eb)->num_batches; ++(_i)) 1893 + #define for_each_batch_add_order(_eb, _i) \ 1894 + BUILD_BUG_ON(!typecheck(int, _i)); \ 1895 + for ((_i) = (_eb)->num_batches - 1; (_i) >= 0; --(_i)) 1896 + 1897 + static struct i915_request * 1898 + eb_find_first_request_added(struct i915_execbuffer *eb) 1899 + { 1900 + int i; 1901 + 1902 + for_each_batch_add_order(eb, i) 1903 + if (eb->requests[i]) 1904 + return eb->requests[i]; 1905 + 1906 + GEM_BUG_ON("Request not found"); 1907 + 1908 + return NULL; 1909 + } 1910 + 1854 1911 static int eb_move_to_gpu(struct i915_execbuffer *eb) 1855 1912 { 1856 1913 const unsigned int count = eb->buffer_count; 1857 1914 unsigned int i = count; 1858 - int err = 0; 1915 + int err = 0, j; 1859 1916 1860 1917 while (i--) { 1861 1918 struct eb_vma *ev = &eb->vma[i]; ··· 1897 1896 if (flags & EXEC_OBJECT_CAPTURE) { 1898 1897 struct i915_capture_list *capture; 1899 1898 1900 - capture = kmalloc(sizeof(*capture), GFP_KERNEL); 1901 - if (capture) { 1902 - capture->next = eb->request->capture_list; 1903 - capture->vma = vma; 1904 - eb->request->capture_list = capture; 1899 + for_each_batch_create_order(eb, j) { 1900 + if (!eb->requests[j]) 1901 + break; 1902 + 1903 + capture = kmalloc(sizeof(*capture), GFP_KERNEL); 1904 + if (capture) { 1905 + capture->next = 1906 + eb->requests[j]->capture_list; 1907 + capture->vma = vma; 1908 + eb->requests[j]->capture_list = capture; 1909 + } 1905 1910 } 1906 1911 } 1907 1912 ··· 1922 1915 * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) 1923 1916 * but gcc's optimiser doesn't handle that as well and emits 1924 1917 * two jumps instead of one. Maybe one day... 1918 + * 1919 + * FIXME: There is also sync flushing in set_pages(), which 1920 + * serves a different purpose(some of the time at least). 1921 + * 1922 + * We should consider: 1923 + * 1924 + * 1. Rip out the async flush code. 1925 + * 1926 + * 2. 
Or make the sync flushing use the async clflush path 1927 + * using mandatory fences underneath. Currently the below 1928 + * async flush happens after we bind the object. 1925 1929 */ 1926 1930 if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { 1927 1931 if (i915_gem_clflush_object(obj, 0)) 1928 1932 flags &= ~EXEC_OBJECT_ASYNC; 1929 1933 } 1930 1934 1935 + /* We only need to await on the first request */ 1931 1936 if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) { 1932 1937 err = i915_request_await_object 1933 - (eb->request, obj, flags & EXEC_OBJECT_WRITE); 1938 + (eb_find_first_request_added(eb), obj, 1939 + flags & EXEC_OBJECT_WRITE); 1934 1940 } 1935 1941 1936 - if (err == 0) 1937 - err = i915_vma_move_to_active(vma, eb->request, 1938 - flags | __EXEC_OBJECT_NO_RESERVE); 1942 + for_each_batch_add_order(eb, j) { 1943 + if (err) 1944 + break; 1945 + if (!eb->requests[j]) 1946 + continue; 1947 + 1948 + err = _i915_vma_move_to_active(vma, eb->requests[j], 1949 + j ? NULL : 1950 + eb->composite_fence ? 1951 + eb->composite_fence : 1952 + &eb->requests[j]->fence, 1953 + flags | __EXEC_OBJECT_NO_RESERVE); 1954 + } 1939 1955 } 1940 1956 1941 1957 #ifdef CONFIG_MMU_NOTIFIER ··· 1989 1959 goto err_skip; 1990 1960 1991 1961 /* Unconditionally flush any chipset caches (for streaming writes). 
*/ 1992 - intel_gt_chipset_flush(eb->engine->gt); 1962 + intel_gt_chipset_flush(eb->gt); 1993 1963 return 0; 1994 1964 1995 1965 err_skip: 1996 - i915_request_set_error_once(eb->request, err); 1966 + for_each_batch_create_order(eb, j) { 1967 + if (!eb->requests[j]) 1968 + break; 1969 + 1970 + i915_request_set_error_once(eb->requests[j], err); 1971 + } 1997 1972 return err; 1998 1973 } 1999 1974 ··· 2093 2058 int err; 2094 2059 2095 2060 if (!eb_use_cmdparser(eb)) { 2096 - batch = eb_dispatch_secure(eb, eb->batch->vma); 2061 + batch = eb_dispatch_secure(eb, eb->batches[0]->vma); 2097 2062 if (IS_ERR(batch)) 2098 2063 return PTR_ERR(batch); 2099 2064 2100 2065 goto secure_batch; 2101 2066 } 2102 2067 2103 - len = eb->batch_len; 2068 + if (intel_context_is_parallel(eb->context)) 2069 + return -EINVAL; 2070 + 2071 + len = eb->batch_len[0]; 2104 2072 if (!CMDPARSER_USES_GGTT(eb->i915)) { 2105 2073 /* 2106 2074 * ppGTT backed shadow buffers must be mapped RO, to prevent ··· 2117 2079 } else { 2118 2080 len += I915_CMD_PARSER_TRAMPOLINE_SIZE; 2119 2081 } 2120 - if (unlikely(len < eb->batch_len)) /* last paranoid check of overflow */ 2082 + if (unlikely(len < eb->batch_len[0])) /* last paranoid check of overflow */ 2121 2083 return -EINVAL; 2122 2084 2123 2085 if (!pool) { 2124 - pool = intel_gt_get_buffer_pool(eb->engine->gt, len, 2086 + pool = intel_gt_get_buffer_pool(eb->gt, len, 2125 2087 I915_MAP_WB); 2126 2088 if (IS_ERR(pool)) 2127 2089 return PTR_ERR(pool); ··· 2146 2108 trampoline = shadow; 2147 2109 2148 2110 shadow = shadow_batch_pin(eb, pool->obj, 2149 - &eb->engine->gt->ggtt->vm, 2111 + &eb->gt->ggtt->vm, 2150 2112 PIN_GLOBAL); 2151 2113 if (IS_ERR(shadow)) { 2152 2114 err = PTR_ERR(shadow); ··· 2168 2130 if (err) 2169 2131 goto err_trampoline; 2170 2132 2171 - err = intel_engine_cmd_parser(eb->engine, 2172 - eb->batch->vma, 2133 + err = intel_engine_cmd_parser(eb->context->engine, 2134 + eb->batches[0]->vma, 2173 2135 eb->batch_start_offset, 2174 - 
eb->batch_len, 2136 + eb->batch_len[0], 2175 2137 shadow, trampoline); 2176 2138 if (err) 2177 2139 goto err_unpin_batch; 2178 2140 2179 - eb->batch = &eb->vma[eb->buffer_count++]; 2180 - eb->batch->vma = i915_vma_get(shadow); 2181 - eb->batch->flags = __EXEC_OBJECT_HAS_PIN; 2141 + eb->batches[0] = &eb->vma[eb->buffer_count++]; 2142 + eb->batches[0]->vma = i915_vma_get(shadow); 2143 + eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN; 2182 2144 2183 2145 eb->trampoline = trampoline; 2184 2146 eb->batch_start_offset = 0; 2185 2147 2186 2148 secure_batch: 2187 2149 if (batch) { 2188 - eb->batch = &eb->vma[eb->buffer_count++]; 2189 - eb->batch->flags = __EXEC_OBJECT_HAS_PIN; 2190 - eb->batch->vma = i915_vma_get(batch); 2150 + if (intel_context_is_parallel(eb->context)) 2151 + return -EINVAL; 2152 + 2153 + eb->batches[0] = &eb->vma[eb->buffer_count++]; 2154 + eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN; 2155 + eb->batches[0]->vma = i915_vma_get(batch); 2191 2156 } 2192 2157 return 0; 2193 2158 ··· 2206 2165 return err; 2207 2166 } 2208 2167 2209 - static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) 2168 + static int eb_request_submit(struct i915_execbuffer *eb, 2169 + struct i915_request *rq, 2170 + struct i915_vma *batch, 2171 + u64 batch_len) 2210 2172 { 2211 2173 int err; 2212 2174 2213 - if (intel_context_nopreempt(eb->context)) 2214 - __set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags); 2215 - 2216 - err = eb_move_to_gpu(eb); 2217 - if (err) 2218 - return err; 2175 + if (intel_context_nopreempt(rq->context)) 2176 + __set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags); 2219 2177 2220 2178 if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) { 2221 - err = i915_reset_gen7_sol_offsets(eb->request); 2179 + err = i915_reset_gen7_sol_offsets(rq); 2222 2180 if (err) 2223 2181 return err; 2224 2182 } ··· 2228 2188 * allows us to determine if the batch is still waiting on the GPU 2229 2189 * or actually running by checking the breadcrumb. 
2230 2190 */ 2231 - if (eb->engine->emit_init_breadcrumb) { 2232 - err = eb->engine->emit_init_breadcrumb(eb->request); 2191 + if (rq->context->engine->emit_init_breadcrumb) { 2192 + err = rq->context->engine->emit_init_breadcrumb(rq); 2233 2193 if (err) 2234 2194 return err; 2235 2195 } 2236 2196 2237 - err = eb->engine->emit_bb_start(eb->request, 2238 - batch->node.start + 2239 - eb->batch_start_offset, 2240 - eb->batch_len, 2241 - eb->batch_flags); 2197 + err = rq->context->engine->emit_bb_start(rq, 2198 + batch->node.start + 2199 + eb->batch_start_offset, 2200 + batch_len, 2201 + eb->batch_flags); 2242 2202 if (err) 2243 2203 return err; 2244 2204 2245 2205 if (eb->trampoline) { 2206 + GEM_BUG_ON(intel_context_is_parallel(rq->context)); 2246 2207 GEM_BUG_ON(eb->batch_start_offset); 2247 - err = eb->engine->emit_bb_start(eb->request, 2248 - eb->trampoline->node.start + 2249 - eb->batch_len, 2250 - 0, 0); 2208 + err = rq->context->engine->emit_bb_start(rq, 2209 + eb->trampoline->node.start + 2210 + batch_len, 0, 0); 2251 2211 if (err) 2252 2212 return err; 2253 2213 } 2254 2214 2255 2215 return 0; 2216 + } 2217 + 2218 + static int eb_submit(struct i915_execbuffer *eb) 2219 + { 2220 + unsigned int i; 2221 + int err; 2222 + 2223 + err = eb_move_to_gpu(eb); 2224 + 2225 + for_each_batch_create_order(eb, i) { 2226 + if (!eb->requests[i]) 2227 + break; 2228 + 2229 + trace_i915_request_queue(eb->requests[i], eb->batch_flags); 2230 + if (!err) 2231 + err = eb_request_submit(eb, eb->requests[i], 2232 + eb->batches[i]->vma, 2233 + eb->batch_len[i]); 2234 + } 2235 + 2236 + return err; 2256 2237 } 2257 2238 2258 2239 static int num_vcs_engines(const struct drm_i915_private *i915) ··· 2341 2280 return i915_request_get(rq); 2342 2281 } 2343 2282 2344 - static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throttle) 2283 + static int eb_pin_timeline(struct i915_execbuffer *eb, struct intel_context *ce, 2284 + bool throttle) 2345 2285 { 2346 - struct 
intel_context *ce = eb->context; 2347 2286 struct intel_timeline *tl; 2348 2287 struct i915_request *rq = NULL; 2349 - int err; 2350 - 2351 - GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED); 2352 - 2353 - if (unlikely(intel_context_is_banned(ce))) 2354 - return ERR_PTR(-EIO); 2355 - 2356 - /* 2357 - * Pinning the contexts may generate requests in order to acquire 2358 - * GGTT space, so do this first before we reserve a seqno for 2359 - * ourselves. 2360 - */ 2361 - err = intel_context_pin_ww(ce, &eb->ww); 2362 - if (err) 2363 - return ERR_PTR(err); 2364 2288 2365 2289 /* 2366 2290 * Take a local wakeref for preparing to dispatch the execbuf as ··· 2356 2310 * taken on the engine, and the parent device. 2357 2311 */ 2358 2312 tl = intel_context_timeline_lock(ce); 2359 - if (IS_ERR(tl)) { 2360 - intel_context_unpin(ce); 2361 - return ERR_CAST(tl); 2362 - } 2313 + if (IS_ERR(tl)) 2314 + return PTR_ERR(tl); 2363 2315 2364 2316 intel_context_enter(ce); 2365 2317 if (throttle) 2366 2318 rq = eb_throttle(eb, ce); 2367 2319 intel_context_timeline_unlock(tl); 2368 2320 2321 + if (rq) { 2322 + bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; 2323 + long timeout = nonblock ? 
0 : MAX_SCHEDULE_TIMEOUT; 2324 + 2325 + if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 2326 + timeout) < 0) { 2327 + i915_request_put(rq); 2328 + 2329 + tl = intel_context_timeline_lock(ce); 2330 + intel_context_exit(ce); 2331 + intel_context_timeline_unlock(tl); 2332 + 2333 + if (nonblock) 2334 + return -EWOULDBLOCK; 2335 + else 2336 + return -EINTR; 2337 + } 2338 + i915_request_put(rq); 2339 + } 2340 + 2341 + return 0; 2342 + } 2343 + 2344 + static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle) 2345 + { 2346 + struct intel_context *ce = eb->context, *child; 2347 + int err; 2348 + int i = 0, j = 0; 2349 + 2350 + GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED); 2351 + 2352 + if (unlikely(intel_context_is_banned(ce))) 2353 + return -EIO; 2354 + 2355 + /* 2356 + * Pinning the contexts may generate requests in order to acquire 2357 + * GGTT space, so do this first before we reserve a seqno for 2358 + * ourselves. 2359 + */ 2360 + err = intel_context_pin_ww(ce, &eb->ww); 2361 + if (err) 2362 + return err; 2363 + for_each_child(ce, child) { 2364 + err = intel_context_pin_ww(child, &eb->ww); 2365 + GEM_BUG_ON(err); /* perma-pinned should incr a counter */ 2366 + } 2367 + 2368 + for_each_child(ce, child) { 2369 + err = eb_pin_timeline(eb, child, throttle); 2370 + if (err) 2371 + goto unwind; 2372 + ++i; 2373 + } 2374 + err = eb_pin_timeline(eb, ce, throttle); 2375 + if (err) 2376 + goto unwind; 2377 + 2369 2378 eb->args->flags |= __EXEC_ENGINE_PINNED; 2370 - return rq; 2379 + return 0; 2380 + 2381 + unwind: 2382 + for_each_child(ce, child) { 2383 + if (j++ < i) { 2384 + mutex_lock(&child->timeline->mutex); 2385 + intel_context_exit(child); 2386 + mutex_unlock(&child->timeline->mutex); 2387 + } 2388 + } 2389 + for_each_child(ce, child) 2390 + intel_context_unpin(child); 2391 + intel_context_unpin(ce); 2392 + return err; 2371 2393 } 2372 2394 2373 2395 static void eb_unpin_engine(struct i915_execbuffer *eb) 2374 2396 { 2375 - struct intel_context *ce = 
eb->context; 2376 - struct intel_timeline *tl = ce->timeline; 2397 + struct intel_context *ce = eb->context, *child; 2377 2398 2378 2399 if (!(eb->args->flags & __EXEC_ENGINE_PINNED)) 2379 2400 return; 2380 2401 2381 2402 eb->args->flags &= ~__EXEC_ENGINE_PINNED; 2382 2403 2383 - mutex_lock(&tl->mutex); 2404 + for_each_child(ce, child) { 2405 + mutex_lock(&child->timeline->mutex); 2406 + intel_context_exit(child); 2407 + mutex_unlock(&child->timeline->mutex); 2408 + 2409 + intel_context_unpin(child); 2410 + } 2411 + 2412 + mutex_lock(&ce->timeline->mutex); 2384 2413 intel_context_exit(ce); 2385 - mutex_unlock(&tl->mutex); 2414 + mutex_unlock(&ce->timeline->mutex); 2386 2415 2387 2416 intel_context_unpin(ce); 2388 2417 } ··· 2508 2387 static int 2509 2388 eb_select_engine(struct i915_execbuffer *eb) 2510 2389 { 2511 - struct intel_context *ce; 2390 + struct intel_context *ce, *child; 2512 2391 unsigned int idx; 2513 2392 int err; 2514 2393 ··· 2521 2400 if (IS_ERR(ce)) 2522 2401 return PTR_ERR(ce); 2523 2402 2403 + if (intel_context_is_parallel(ce)) { 2404 + if (eb->buffer_count < ce->parallel.number_children + 1) { 2405 + intel_context_put(ce); 2406 + return -EINVAL; 2407 + } 2408 + if (eb->batch_start_offset || eb->args->batch_len) { 2409 + intel_context_put(ce); 2410 + return -EINVAL; 2411 + } 2412 + } 2413 + eb->num_batches = ce->parallel.number_children + 1; 2414 + 2415 + for_each_child(ce, child) 2416 + intel_context_get(child); 2524 2417 intel_gt_pm_get(ce->engine->gt); 2525 2418 2526 2419 if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { 2527 2420 err = intel_context_alloc_state(ce); 2528 2421 if (err) 2529 2422 goto err; 2423 + } 2424 + for_each_child(ce, child) { 2425 + if (!test_bit(CONTEXT_ALLOC_BIT, &child->flags)) { 2426 + err = intel_context_alloc_state(child); 2427 + if (err) 2428 + goto err; 2429 + } 2530 2430 } 2531 2431 2532 2432 /* ··· 2559 2417 goto err; 2560 2418 2561 2419 eb->context = ce; 2562 - eb->engine = ce->engine; 2420 + eb->gt = 
ce->engine->gt; 2563 2421 2564 2422 /* 2565 2423 * Make sure engine pool stays alive even if we call intel_context_put ··· 2570 2428 2571 2429 err: 2572 2430 intel_gt_pm_put(ce->engine->gt); 2431 + for_each_child(ce, child) 2432 + intel_context_put(child); 2573 2433 intel_context_put(ce); 2574 2434 return err; 2575 2435 } ··· 2579 2435 static void 2580 2436 eb_put_engine(struct i915_execbuffer *eb) 2581 2437 { 2582 - intel_gt_pm_put(eb->engine->gt); 2438 + struct intel_context *child; 2439 + 2440 + intel_gt_pm_put(eb->gt); 2441 + for_each_child(eb->context, child) 2442 + intel_context_put(child); 2583 2443 intel_context_put(eb->context); 2584 2444 } 2585 2445 ··· 2806 2658 } 2807 2659 2808 2660 static int 2809 - await_fence_array(struct i915_execbuffer *eb) 2661 + await_fence_array(struct i915_execbuffer *eb, 2662 + struct i915_request *rq) 2810 2663 { 2811 2664 unsigned int n; 2812 2665 int err; ··· 2821 2672 if (!eb->fences[n].dma_fence) 2822 2673 continue; 2823 2674 2824 - err = i915_request_await_dma_fence(eb->request, 2825 - eb->fences[n].dma_fence); 2675 + err = i915_request_await_dma_fence(rq, eb->fences[n].dma_fence); 2826 2676 if (err < 0) 2827 2677 return err; 2828 2678 } ··· 2829 2681 return 0; 2830 2682 } 2831 2683 2832 - static void signal_fence_array(const struct i915_execbuffer *eb) 2684 + static void signal_fence_array(const struct i915_execbuffer *eb, 2685 + struct dma_fence * const fence) 2833 2686 { 2834 - struct dma_fence * const fence = &eb->request->fence; 2835 2687 unsigned int n; 2836 2688 2837 2689 for (n = 0; n < eb->num_fences; n++) { ··· 2879 2731 break; 2880 2732 } 2881 2733 2882 - static int eb_request_add(struct i915_execbuffer *eb, int err) 2734 + static int eb_request_add(struct i915_execbuffer *eb, struct i915_request *rq, 2735 + int err, bool last_parallel) 2883 2736 { 2884 - struct i915_request *rq = eb->request; 2885 2737 struct intel_timeline * const tl = i915_request_timeline(rq); 2886 2738 struct i915_sched_attr attr = {}; 
2887 2739 struct i915_request *prev; ··· 2903 2755 err = -ENOENT; /* override any transient errors */ 2904 2756 } 2905 2757 2758 + if (intel_context_is_parallel(eb->context)) { 2759 + if (err) { 2760 + __i915_request_skip(rq); 2761 + set_bit(I915_FENCE_FLAG_SKIP_PARALLEL, 2762 + &rq->fence.flags); 2763 + } 2764 + if (last_parallel) 2765 + set_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, 2766 + &rq->fence.flags); 2767 + } 2768 + 2906 2769 __i915_request_queue(rq, &attr); 2907 2770 2908 2771 /* Try to clean up the client's timeline after submitting the request */ ··· 2921 2762 retire_requests(tl, prev); 2922 2763 2923 2764 mutex_unlock(&tl->mutex); 2765 + 2766 + return err; 2767 + } 2768 + 2769 + static int eb_requests_add(struct i915_execbuffer *eb, int err) 2770 + { 2771 + int i; 2772 + 2773 + /* 2774 + * We iterate in reverse order of creation to release timeline mutexes in 2775 + * same order. 2776 + */ 2777 + for_each_batch_add_order(eb, i) { 2778 + struct i915_request *rq = eb->requests[i]; 2779 + 2780 + if (!rq) 2781 + continue; 2782 + err |= eb_request_add(eb, rq, err, i == 0); 2783 + } 2924 2784 2925 2785 return err; 2926 2786 } ··· 2970 2792 eb); 2971 2793 } 2972 2794 2795 + static void eb_requests_get(struct i915_execbuffer *eb) 2796 + { 2797 + unsigned int i; 2798 + 2799 + for_each_batch_create_order(eb, i) { 2800 + if (!eb->requests[i]) 2801 + break; 2802 + 2803 + i915_request_get(eb->requests[i]); 2804 + } 2805 + } 2806 + 2807 + static void eb_requests_put(struct i915_execbuffer *eb) 2808 + { 2809 + unsigned int i; 2810 + 2811 + for_each_batch_create_order(eb, i) { 2812 + if (!eb->requests[i]) 2813 + break; 2814 + 2815 + i915_request_put(eb->requests[i]); 2816 + } 2817 + } 2818 + 2819 + static struct sync_file * 2820 + eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd) 2821 + { 2822 + struct sync_file *out_fence = NULL; 2823 + struct dma_fence_array *fence_array; 2824 + struct dma_fence **fences; 2825 + unsigned int i; 2826 + 2827 + 
GEM_BUG_ON(!intel_context_is_parent(eb->context)); 2828 + 2829 + fences = kmalloc_array(eb->num_batches, sizeof(*fences), GFP_KERNEL); 2830 + if (!fences) 2831 + return ERR_PTR(-ENOMEM); 2832 + 2833 + for_each_batch_create_order(eb, i) { 2834 + fences[i] = &eb->requests[i]->fence; 2835 + __set_bit(I915_FENCE_FLAG_COMPOSITE, 2836 + &eb->requests[i]->fence.flags); 2837 + } 2838 + 2839 + fence_array = dma_fence_array_create(eb->num_batches, 2840 + fences, 2841 + eb->context->parallel.fence_context, 2842 + eb->context->parallel.seqno, 2843 + false); 2844 + if (!fence_array) { 2845 + kfree(fences); 2846 + return ERR_PTR(-ENOMEM); 2847 + } 2848 + 2849 + /* Move ownership to the dma_fence_array created above */ 2850 + for_each_batch_create_order(eb, i) 2851 + dma_fence_get(fences[i]); 2852 + 2853 + if (out_fence_fd != -1) { 2854 + out_fence = sync_file_create(&fence_array->base); 2855 + /* sync_file now owns fence_arry, drop creation ref */ 2856 + dma_fence_put(&fence_array->base); 2857 + if (!out_fence) 2858 + return ERR_PTR(-ENOMEM); 2859 + } 2860 + 2861 + eb->composite_fence = &fence_array->base; 2862 + 2863 + return out_fence; 2864 + } 2865 + 2866 + static struct sync_file * 2867 + eb_fences_add(struct i915_execbuffer *eb, struct i915_request *rq, 2868 + struct dma_fence *in_fence, int out_fence_fd) 2869 + { 2870 + struct sync_file *out_fence = NULL; 2871 + int err; 2872 + 2873 + if (unlikely(eb->gem_context->syncobj)) { 2874 + struct dma_fence *fence; 2875 + 2876 + fence = drm_syncobj_fence_get(eb->gem_context->syncobj); 2877 + err = i915_request_await_dma_fence(rq, fence); 2878 + dma_fence_put(fence); 2879 + if (err) 2880 + return ERR_PTR(err); 2881 + } 2882 + 2883 + if (in_fence) { 2884 + if (eb->args->flags & I915_EXEC_FENCE_SUBMIT) 2885 + err = i915_request_await_execution(rq, in_fence); 2886 + else 2887 + err = i915_request_await_dma_fence(rq, in_fence); 2888 + if (err < 0) 2889 + return ERR_PTR(err); 2890 + } 2891 + 2892 + if (eb->fences) { 2893 + err = 
await_fence_array(eb, rq); 2894 + if (err) 2895 + return ERR_PTR(err); 2896 + } 2897 + 2898 + if (intel_context_is_parallel(eb->context)) { 2899 + out_fence = eb_composite_fence_create(eb, out_fence_fd); 2900 + if (IS_ERR(out_fence)) 2901 + return ERR_PTR(-ENOMEM); 2902 + } else if (out_fence_fd != -1) { 2903 + out_fence = sync_file_create(&rq->fence); 2904 + if (!out_fence) 2905 + return ERR_PTR(-ENOMEM); 2906 + } 2907 + 2908 + return out_fence; 2909 + } 2910 + 2911 + static struct intel_context * 2912 + eb_find_context(struct i915_execbuffer *eb, unsigned int context_number) 2913 + { 2914 + struct intel_context *child; 2915 + 2916 + if (likely(context_number == 0)) 2917 + return eb->context; 2918 + 2919 + for_each_child(eb->context, child) 2920 + if (!--context_number) 2921 + return child; 2922 + 2923 + GEM_BUG_ON("Context not found"); 2924 + 2925 + return NULL; 2926 + } 2927 + 2928 + static struct sync_file * 2929 + eb_requests_create(struct i915_execbuffer *eb, struct dma_fence *in_fence, 2930 + int out_fence_fd) 2931 + { 2932 + struct sync_file *out_fence = NULL; 2933 + unsigned int i; 2934 + 2935 + for_each_batch_create_order(eb, i) { 2936 + /* Allocate a request for this batch buffer nice and early. */ 2937 + eb->requests[i] = i915_request_create(eb_find_context(eb, i)); 2938 + if (IS_ERR(eb->requests[i])) { 2939 + out_fence = ERR_PTR(PTR_ERR(eb->requests[i])); 2940 + eb->requests[i] = NULL; 2941 + return out_fence; 2942 + } 2943 + 2944 + /* 2945 + * Only the first request added (committed to backend) has to 2946 + * take the in fences into account as all subsequent requests 2947 + * will have fences inserted inbetween them. 2948 + */ 2949 + if (i + 1 == eb->num_batches) { 2950 + out_fence = eb_fences_add(eb, eb->requests[i], 2951 + in_fence, out_fence_fd); 2952 + if (IS_ERR(out_fence)) 2953 + return out_fence; 2954 + } 2955 + 2956 + /* 2957 + * Whilst this request exists, batch_obj will be on the 2958 + * active_list, and so will hold the active reference. 
Only when 2959 + * this request is retired will the batch_obj be moved onto 2960 + * the inactive_list and lose its active reference. Hence we do 2961 + * not need to explicitly hold another reference here. 2962 + */ 2963 + eb->requests[i]->batch = eb->batches[i]->vma; 2964 + if (eb->batch_pool) { 2965 + GEM_BUG_ON(intel_context_is_parallel(eb->context)); 2966 + intel_gt_buffer_pool_mark_active(eb->batch_pool, 2967 + eb->requests[i]); 2968 + } 2969 + } 2970 + 2971 + return out_fence; 2972 + } 2973 + 2973 2974 static int 2974 2975 i915_gem_do_execbuffer(struct drm_device *dev, 2975 2976 struct drm_file *file, ··· 3159 2802 struct i915_execbuffer eb; 3160 2803 struct dma_fence *in_fence = NULL; 3161 2804 struct sync_file *out_fence = NULL; 3162 - struct i915_vma *batch; 3163 2805 int out_fence_fd = -1; 3164 2806 int err; 3165 2807 ··· 3182 2826 3183 2827 eb.buffer_count = args->buffer_count; 3184 2828 eb.batch_start_offset = args->batch_start_offset; 3185 - eb.batch_len = args->batch_len; 3186 2829 eb.trampoline = NULL; 3187 2830 3188 2831 eb.fences = NULL; 3189 2832 eb.num_fences = 0; 2833 + 2834 + memset(eb.requests, 0, sizeof(struct i915_request *) * 2835 + ARRAY_SIZE(eb.requests)); 2836 + eb.composite_fence = NULL; 3190 2837 3191 2838 eb.batch_flags = 0; 3192 2839 if (args->flags & I915_EXEC_SECURE) { ··· 3274 2915 3275 2916 ww_acquire_done(&eb.ww.ctx); 3276 2917 3277 - batch = eb.batch->vma; 3278 - 3279 - /* Allocate a request for this batch buffer nice and early. 
*/ 3280 - eb.request = i915_request_create(eb.context); 3281 - if (IS_ERR(eb.request)) { 3282 - err = PTR_ERR(eb.request); 3283 - goto err_vma; 3284 - } 3285 - 3286 - if (unlikely(eb.gem_context->syncobj)) { 3287 - struct dma_fence *fence; 3288 - 3289 - fence = drm_syncobj_fence_get(eb.gem_context->syncobj); 3290 - err = i915_request_await_dma_fence(eb.request, fence); 3291 - dma_fence_put(fence); 3292 - if (err) 3293 - goto err_ext; 3294 - } 3295 - 3296 - if (in_fence) { 3297 - if (args->flags & I915_EXEC_FENCE_SUBMIT) 3298 - err = i915_request_await_execution(eb.request, 3299 - in_fence); 2918 + out_fence = eb_requests_create(&eb, in_fence, out_fence_fd); 2919 + if (IS_ERR(out_fence)) { 2920 + err = PTR_ERR(out_fence); 2921 + if (eb.requests[0]) 2922 + goto err_request; 3300 2923 else 3301 - err = i915_request_await_dma_fence(eb.request, 3302 - in_fence); 3303 - if (err < 0) 3304 - goto err_request; 2924 + goto err_vma; 3305 2925 } 3306 2926 3307 - if (eb.fences) { 3308 - err = await_fence_array(&eb); 3309 - if (err) 3310 - goto err_request; 3311 - } 3312 - 3313 - if (out_fence_fd != -1) { 3314 - out_fence = sync_file_create(&eb.request->fence); 3315 - if (!out_fence) { 3316 - err = -ENOMEM; 3317 - goto err_request; 3318 - } 3319 - } 3320 - 3321 - /* 3322 - * Whilst this request exists, batch_obj will be on the 3323 - * active_list, and so will hold the active reference. Only when this 3324 - * request is retired will the the batch_obj be moved onto the 3325 - * inactive_list and lose its active reference. Hence we do not need 3326 - * to explicitly hold another reference here. 
3327 - */ 3328 - eb.request->batch = batch; 3329 - if (eb.batch_pool) 3330 - intel_gt_buffer_pool_mark_active(eb.batch_pool, eb.request); 3331 - 3332 - trace_i915_request_queue(eb.request, eb.batch_flags); 3333 - err = eb_submit(&eb, batch); 2927 + err = eb_submit(&eb); 3334 2928 3335 2929 err_request: 3336 - i915_request_get(eb.request); 3337 - err = eb_request_add(&eb, err); 2930 + eb_requests_get(&eb); 2931 + err = eb_requests_add(&eb, err); 3338 2932 3339 2933 if (eb.fences) 3340 - signal_fence_array(&eb); 2934 + signal_fence_array(&eb, eb.composite_fence ? 2935 + eb.composite_fence : 2936 + &eb.requests[0]->fence); 3341 2937 3342 2938 if (out_fence) { 3343 2939 if (err == 0) { ··· 3307 2993 3308 2994 if (unlikely(eb.gem_context->syncobj)) { 3309 2995 drm_syncobj_replace_fence(eb.gem_context->syncobj, 3310 - &eb.request->fence); 2996 + eb.composite_fence ? 2997 + eb.composite_fence : 2998 + &eb.requests[0]->fence); 3311 2999 } 3312 3000 3313 - i915_request_put(eb.request); 3001 + if (!out_fence && eb.composite_fence) 3002 + dma_fence_put(eb.composite_fence); 3003 + 3004 + eb_requests_put(&eb); 3314 3005 3315 3006 err_vma: 3316 3007 eb_release_vmas(&eb, true);
+2
drivers/gpu/drm/i915/gem/i915_gem_internal.c
··· 134 134 internal_free_pages(pages); 135 135 136 136 obj->mm.dirty = false; 137 + 138 + __start_cpu_write(obj); 137 139 } 138 140 139 141 static const struct drm_i915_gem_object_ops i915_gem_object_internal_ops = {
+26
drivers/gpu/drm/i915/gem/i915_gem_object.c
··· 128 128 !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE); 129 129 } 130 130 131 + bool i915_gem_object_can_bypass_llc(struct drm_i915_gem_object *obj) 132 + { 133 + struct drm_i915_private *i915 = to_i915(obj->base.dev); 134 + 135 + /* 136 + * This is purely from a security perspective, so we simply don't care 137 + * about non-userspace objects being able to bypass the LLC. 138 + */ 139 + if (!(obj->flags & I915_BO_ALLOC_USER)) 140 + return false; 141 + 142 + /* 143 + * EHL and JSL add the 'Bypass LLC' MOCS entry, which should make it 144 + * possible for userspace to bypass the GTT caching bits set by the 145 + * kernel, as per the given object cache_level. This is troublesome 146 + * since the heavy flush we apply when first gathering the pages is 147 + * skipped if the kernel thinks the object is coherent with the GPU. As 148 + * a result it might be possible to bypass the cache and read the 149 + * contents of the page directly, which could be stale data. If it's 150 + * just a case of userspace shooting themselves in the foot then so be 151 + * it, but since i915 takes the stance of always zeroing memory before 152 + * handing it to userspace, we need to prevent this. 153 + */ 154 + return IS_JSL_EHL(i915); 155 + } 156 + 131 157 static void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file) 132 158 { 133 159 struct drm_i915_gem_object *obj = to_intel_bo(gem);
+1
drivers/gpu/drm/i915/gem/i915_gem_object.h
··· 514 514 515 515 void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj, 516 516 unsigned int cache_level); 517 + bool i915_gem_object_can_bypass_llc(struct drm_i915_gem_object *obj); 517 518 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj); 518 519 void i915_gem_object_flush_if_display_locked(struct drm_i915_gem_object *obj); 519 520
+27
drivers/gpu/drm/i915/gem/i915_gem_object_types.h
··· 427 427 * can freely bypass the CPU cache when touching the pages with the GPU, 428 428 * where the kernel is completely unaware. On such platform we need 429 429 * apply the sledgehammer-on-acquire regardless of the @cache_coherent. 430 + * 431 + * Special care is taken on non-LLC platforms, to prevent potential 432 + * information leak. The driver currently ensures: 433 + * 434 + * 1. All userspace objects, by default, have @cache_level set as 435 + * I915_CACHE_NONE. The only exception is userptr objects, where we 436 + * instead force I915_CACHE_LLC, but we also don't allow userspace to 437 + * ever change the @cache_level for such objects. Another special case 438 + * is dma-buf, which doesn't rely on @cache_dirty, but there we 439 + * always do a forced flush when acquiring the pages, if there is a 440 + * chance that the pages can be read directly from main memory with 441 + * the GPU. 442 + * 443 + * 2. All I915_CACHE_NONE objects have @cache_dirty initially true. 444 + * 445 + * 3. All swapped-out objects(i.e shmem) have @cache_dirty set to 446 + * true. 447 + * 448 + * 4. The @cache_dirty is never freely reset before the initial 449 + * flush, even if userspace adjusts the @cache_level through the 450 + * i915_gem_set_caching_ioctl. 451 + * 452 + * 5. All @cache_dirty objects(including swapped-in) are initially 453 + * flushed with a synchronous call to drm_clflush_sg in 454 + * __i915_gem_object_set_pages. The @cache_dirty can be freely reset 455 + * at this point. All further asynchronous clfushes are never security 456 + * critical, i.e userspace is free to race against itself. 430 457 */ 431 458 unsigned int cache_dirty:1; 432 459
+13 -16
drivers/gpu/drm/i915/gem/i915_gem_shmem.c
··· 182 182 if (i915_gem_object_needs_bit17_swizzle(obj)) 183 183 i915_gem_object_do_bit_17_swizzle(obj, st); 184 184 185 - /* 186 - * EHL and JSL add the 'Bypass LLC' MOCS entry, which should make it 187 - * possible for userspace to bypass the GTT caching bits set by the 188 - * kernel, as per the given object cache_level. This is troublesome 189 - * since the heavy flush we apply when first gathering the pages is 190 - * skipped if the kernel thinks the object is coherent with the GPU. As 191 - * a result it might be possible to bypass the cache and read the 192 - * contents of the page directly, which could be stale data. If it's 193 - * just a case of userspace shooting themselves in the foot then so be 194 - * it, but since i915 takes the stance of always zeroing memory before 195 - * handing it to userspace, we need to prevent this. 196 - * 197 - * By setting cache_dirty here we make the clflush in set_pages 198 - * unconditional on such platforms. 199 - */ 200 - if (IS_JSL_EHL(i915) && obj->flags & I915_BO_ALLOC_USER) 185 + if (i915_gem_object_can_bypass_llc(obj)) 201 186 obj->cache_dirty = true; 202 187 203 188 __i915_gem_object_set_pages(obj, st, sg_page_sizes); ··· 286 301 struct sg_table *pages, 287 302 bool needs_clflush) 288 303 { 304 + struct drm_i915_private *i915 = to_i915(obj->base.dev); 305 + 289 306 GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED); 290 307 291 308 if (obj->mm.madv == I915_MADV_DONTNEED) ··· 299 312 drm_clflush_sg(pages); 300 313 301 314 __start_cpu_write(obj); 315 + /* 316 + * On non-LLC platforms, force the flush-on-acquire if this is ever 317 + * swapped-in. Our async flush path is not trust worthy enough yet(and 318 + * happens in the wrong order), and with some tricks it's conceivable 319 + * for userspace to change the cache-level to I915_CACHE_NONE after the 320 + * pages are swapped-in, and since execbuf binds the object before doing 321 + * the async flush, we have a race window. 
322 + */ 323 + if (!HAS_LLC(i915)) 324 + obj->cache_dirty = true; 302 325 } 303 326 304 327 void i915_gem_object_put_pages_shmem(struct drm_i915_gem_object *obj, struct sg_table *pages)
+6 -2
drivers/gpu/drm/i915/gem/i915_gem_userptr.c
··· 165 165 goto err; 166 166 } 167 167 168 - sg_page_sizes = i915_sg_dma_sizes(st->sgl); 168 + WARN_ON_ONCE(!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE)); 169 + if (i915_gem_object_can_bypass_llc(obj)) 170 + obj->cache_dirty = true; 169 171 172 + sg_page_sizes = i915_sg_dma_sizes(st->sgl); 170 173 __i915_gem_object_set_pages(obj, st, sg_page_sizes); 171 174 172 175 return 0; ··· 549 546 return -ENOMEM; 550 547 551 548 drm_gem_private_object_init(dev, &obj->base, args->user_size); 552 - i915_gem_object_init(obj, &i915_gem_userptr_ops, &lock_class, 0); 549 + i915_gem_object_init(obj, &i915_gem_userptr_ops, &lock_class, 550 + I915_BO_ALLOC_USER); 553 551 obj->mem_flags = I915_BO_FLAG_STRUCT_PAGE; 554 552 obj->read_domains = I915_GEM_DOMAIN_CPU; 555 553 obj->write_domain = I915_GEM_DOMAIN_CPU;
+6 -1
drivers/gpu/drm/i915/gem/selftests/huge_pages.c
··· 136 136 huge_pages_free_pages(pages); 137 137 138 138 obj->mm.dirty = false; 139 + 140 + __start_cpu_write(obj); 139 141 } 140 142 141 143 static const struct drm_i915_gem_object_ops huge_page_ops = { ··· 154 152 { 155 153 static struct lock_class_key lock_class; 156 154 struct drm_i915_gem_object *obj; 155 + unsigned int cache_level; 157 156 158 157 GEM_BUG_ON(!size); 159 158 GEM_BUG_ON(!IS_ALIGNED(size, BIT(__ffs(page_mask)))); ··· 176 173 177 174 obj->write_domain = I915_GEM_DOMAIN_CPU; 178 175 obj->read_domains = I915_GEM_DOMAIN_CPU; 179 - obj->cache_level = I915_CACHE_NONE; 176 + 177 + cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE; 178 + i915_gem_object_set_cache_coherency(obj, cache_level); 180 179 181 180 obj->mm.page_mask = page_mask; 182 181
+18 -11
drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c
··· 17 17 #include "huge_gem_object.h" 18 18 #include "mock_context.h" 19 19 20 + enum client_tiling { 21 + CLIENT_TILING_LINEAR, 22 + CLIENT_TILING_X, 23 + CLIENT_TILING_Y, 24 + CLIENT_NUM_TILING_TYPES 25 + }; 26 + 20 27 #define WIDTH 512 21 28 #define HEIGHT 32 22 29 23 30 struct blit_buffer { 24 31 struct i915_vma *vma; 25 32 u32 start_val; 26 - u32 tiling; 33 + enum client_tiling tiling; 27 34 }; 28 35 29 36 struct tiled_blits { ··· 60 53 *cs++ = MI_LOAD_REGISTER_IMM(1); 61 54 *cs++ = i915_mmio_reg_offset(BCS_SWCTRL); 62 55 cmd = (BCS_SRC_Y | BCS_DST_Y) << 16; 63 - if (src->tiling == I915_TILING_Y) 56 + if (src->tiling == CLIENT_TILING_Y) 64 57 cmd |= BCS_SRC_Y; 65 - if (dst->tiling == I915_TILING_Y) 58 + if (dst->tiling == CLIENT_TILING_Y) 66 59 cmd |= BCS_DST_Y; 67 60 *cs++ = cmd; 68 61 ··· 179 172 180 173 t->buffers[i].vma = vma; 181 174 t->buffers[i].tiling = 182 - i915_prandom_u32_max_state(I915_TILING_Y + 1, prng); 175 + i915_prandom_u32_max_state(CLIENT_TILING_Y + 1, prng); 183 176 } 184 177 185 178 return 0; ··· 204 197 static u64 tiled_offset(const struct intel_gt *gt, 205 198 u64 v, 206 199 unsigned int stride, 207 - unsigned int tiling) 200 + enum client_tiling tiling) 208 201 { 209 202 unsigned int swizzle; 210 203 u64 x, y; 211 204 212 - if (tiling == I915_TILING_NONE) 205 + if (tiling == CLIENT_TILING_LINEAR) 213 206 return v; 214 207 215 208 y = div64_u64_rem(v, stride, &x); 216 209 217 - if (tiling == I915_TILING_X) { 210 + if (tiling == CLIENT_TILING_X) { 218 211 v = div64_u64_rem(y, 8, &y) * stride * 8; 219 212 v += y * 512; 220 213 v += div64_u64_rem(x, 512, &x) << 12; ··· 251 244 return v; 252 245 } 253 246 254 - static const char *repr_tiling(int tiling) 247 + static const char *repr_tiling(enum client_tiling tiling) 255 248 { 256 249 switch (tiling) { 257 - case I915_TILING_NONE: return "linear"; 258 - case I915_TILING_X: return "X"; 259 - case I915_TILING_Y: return "Y"; 250 + case CLIENT_TILING_LINEAR: return "linear"; 251 + case 
CLIENT_TILING_X: return "X"; 252 + case CLIENT_TILING_Y: return "Y"; 260 253 default: return "unknown"; 261 254 } 262 255 }
+47 -5
drivers/gpu/drm/i915/gt/intel_context.c
··· 240 240 if (err) 241 241 goto err_post_unpin; 242 242 243 + intel_engine_pm_might_get(ce->engine); 244 + 243 245 if (unlikely(intel_context_is_closed(ce))) { 244 246 err = -ENOENT; 245 247 goto err_unlock; ··· 364 362 return 0; 365 363 } 366 364 367 - static int sw_fence_dummy_notify(struct i915_sw_fence *sf, 368 - enum i915_sw_fence_notify state) 365 + static int __i915_sw_fence_call 366 + sw_fence_dummy_notify(struct i915_sw_fence *sf, enum i915_sw_fence_notify state) 369 367 { 370 368 return NOTIFY_DONE; 371 369 } ··· 401 399 ce->guc_id.id = GUC_INVALID_LRC_ID; 402 400 INIT_LIST_HEAD(&ce->guc_id.link); 403 401 402 + INIT_LIST_HEAD(&ce->destroyed_link); 403 + 404 + INIT_LIST_HEAD(&ce->parallel.child_list); 405 + 404 406 /* 405 407 * Initialize fence to be complete as this is expected to be complete 406 408 * unless there is a pending schedule disable outstanding. ··· 419 413 420 414 void intel_context_fini(struct intel_context *ce) 421 415 { 416 + struct intel_context *child, *next; 417 + 422 418 if (ce->timeline) 423 419 intel_timeline_put(ce->timeline); 424 420 i915_vm_put(ce->vm); 421 + 422 + /* Need to put the creation ref for the children */ 423 + if (intel_context_is_parent(ce)) 424 + for_each_child_safe(ce, child, next) 425 + intel_context_put(child); 425 426 426 427 mutex_destroy(&ce->pin_mutex); 427 428 i915_active_fini(&ce->active); ··· 528 515 529 516 struct i915_request *intel_context_find_active_request(struct intel_context *ce) 530 517 { 518 + struct intel_context *parent = intel_context_to_parent(ce); 531 519 struct i915_request *rq, *active = NULL; 532 520 unsigned long flags; 533 521 534 522 GEM_BUG_ON(!intel_engine_uses_guc(ce->engine)); 535 523 536 - spin_lock_irqsave(&ce->guc_state.lock, flags); 537 - list_for_each_entry_reverse(rq, &ce->guc_state.requests, 524 + /* 525 + * We search the parent list to find an active request on the submitted 526 + * context. 
The parent list contains the requests for all the contexts 527 + * in the relationship so we have to do a compare of each request's 528 + * context. 529 + */ 530 + spin_lock_irqsave(&parent->guc_state.lock, flags); 531 + list_for_each_entry_reverse(rq, &parent->guc_state.requests, 538 532 sched.link) { 533 + if (rq->context != ce) 534 + continue; 539 535 if (i915_request_completed(rq)) 540 536 break; 541 537 542 538 active = rq; 543 539 } 544 - spin_unlock_irqrestore(&ce->guc_state.lock, flags); 540 + spin_unlock_irqrestore(&parent->guc_state.lock, flags); 545 541 546 542 return active; 543 + } 544 + 545 + void intel_context_bind_parent_child(struct intel_context *parent, 546 + struct intel_context *child) 547 + { 548 + /* 549 + * Callers responsibility to validate that this function is used 550 + * correctly but we use GEM_BUG_ON here ensure that they do. 551 + */ 552 + GEM_BUG_ON(!intel_engine_uses_guc(parent->engine)); 553 + GEM_BUG_ON(intel_context_is_pinned(parent)); 554 + GEM_BUG_ON(intel_context_is_child(parent)); 555 + GEM_BUG_ON(intel_context_is_pinned(child)); 556 + GEM_BUG_ON(intel_context_is_child(child)); 557 + GEM_BUG_ON(intel_context_is_parent(child)); 558 + 559 + parent->parallel.child_index = parent->parallel.number_children++; 560 + list_add_tail(&child->parallel.child_link, 561 + &parent->parallel.child_list); 562 + child->parallel.parent = parent; 547 563 } 548 564 549 565 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+55 -1
drivers/gpu/drm/i915/gt/intel_context.h
··· 44 44 int intel_context_reconfigure_sseu(struct intel_context *ce, 45 45 const struct intel_sseu sseu); 46 46 47 + #define PARENT_SCRATCH_SIZE PAGE_SIZE 48 + 49 + static inline bool intel_context_is_child(struct intel_context *ce) 50 + { 51 + return !!ce->parallel.parent; 52 + } 53 + 54 + static inline bool intel_context_is_parent(struct intel_context *ce) 55 + { 56 + return !!ce->parallel.number_children; 57 + } 58 + 59 + static inline bool intel_context_is_pinned(struct intel_context *ce); 60 + 61 + static inline struct intel_context * 62 + intel_context_to_parent(struct intel_context *ce) 63 + { 64 + if (intel_context_is_child(ce)) { 65 + /* 66 + * The parent holds ref count to the child so it is always safe 67 + * for the parent to access the child, but the child has a 68 + * pointer to the parent without a ref. To ensure this is safe 69 + * the child should only access the parent pointer while the 70 + * parent is pinned. 71 + */ 72 + GEM_BUG_ON(!intel_context_is_pinned(ce->parallel.parent)); 73 + 74 + return ce->parallel.parent; 75 + } else { 76 + return ce; 77 + } 78 + } 79 + 80 + static inline bool intel_context_is_parallel(struct intel_context *ce) 81 + { 82 + return intel_context_is_child(ce) || intel_context_is_parent(ce); 83 + } 84 + 85 + void intel_context_bind_parent_child(struct intel_context *parent, 86 + struct intel_context *child); 87 + 88 + #define for_each_child(parent, ce)\ 89 + list_for_each_entry(ce, &(parent)->parallel.child_list,\ 90 + parallel.child_link) 91 + #define for_each_child_safe(parent, ce, cn)\ 92 + list_for_each_entry_safe(ce, cn, &(parent)->parallel.child_list,\ 93 + parallel.child_link) 94 + 47 95 /** 48 96 * intel_context_lock_pinned - Stablises the 'pinned' status of the HW context 49 97 * @ce - the context ··· 241 193 struct intel_timeline *tl = ce->timeline; 242 194 int err; 243 195 244 - err = mutex_lock_interruptible(&tl->mutex); 196 + if (intel_context_is_parent(ce)) 197 + err = 
mutex_lock_interruptible_nested(&tl->mutex, 0); 198 + else if (intel_context_is_child(ce)) 199 + err = mutex_lock_interruptible_nested(&tl->mutex, 200 + ce->parallel.child_index + 1); 201 + else 202 + err = mutex_lock_interruptible(&tl->mutex); 245 203 if (err) 246 204 return ERR_PTR(err); 247 205
+68 -5
drivers/gpu/drm/i915/gt/intel_context_types.h
··· 55 55 void (*reset)(struct intel_context *ce); 56 56 void (*destroy)(struct kref *kref); 57 57 58 - /* virtual engine/context interface */ 58 + /* virtual/parallel engine/context interface */ 59 59 struct intel_context *(*create_virtual)(struct intel_engine_cs **engine, 60 - unsigned int count); 60 + unsigned int count, 61 + unsigned long flags); 62 + struct intel_context *(*create_parallel)(struct intel_engine_cs **engines, 63 + unsigned int num_siblings, 64 + unsigned int width); 61 65 struct intel_engine_cs *(*get_sibling)(struct intel_engine_cs *engine, 62 66 unsigned int sibling); 63 67 }; ··· 117 113 #define CONTEXT_NOPREEMPT 8 118 114 #define CONTEXT_LRCA_DIRTY 9 119 115 #define CONTEXT_GUC_INIT 10 116 + #define CONTEXT_PERMA_PIN 11 120 117 121 118 struct { 122 119 u64 timeout_us; ··· 202 197 struct { 203 198 /** 204 199 * @id: handle which is used to uniquely identify this context 205 - * with the GuC, protected by guc->contexts_lock 200 + * with the GuC, protected by guc->submission_state.lock 206 201 */ 207 202 u16 id; 208 203 /** 209 204 * @ref: the number of references to the guc_id, when 210 205 * transitioning in and out of zero protected by 211 - * guc->contexts_lock 206 + * guc->submission_state.lock 212 207 */ 213 208 atomic_t ref; 214 209 /** 215 210 * @link: in guc->guc_id_list when the guc_id has no refs but is 216 - * still valid, protected by guc->contexts_lock 211 + * still valid, protected by guc->submission_state.lock 217 212 */ 218 213 struct list_head link; 219 214 } guc_id; 215 + 216 + /** 217 + * @destroyed_link: link in guc->submission_state.destroyed_contexts, in 218 + * list when context is pending to be destroyed (deregistered with the 219 + * GuC), protected by guc->submission_state.lock 220 + */ 221 + struct list_head destroyed_link; 222 + 223 + /** @parallel: sub-structure for parallel submission members */ 224 + struct { 225 + union { 226 + /** 227 + * @child_list: parent's list of children 228 + * contexts, no protection as 
immutable after context 229 + * creation 230 + */ 231 + struct list_head child_list; 232 + /** 233 + * @child_link: child's link into parent's list of 234 + * children 235 + */ 236 + struct list_head child_link; 237 + }; 238 + /** @parent: pointer to parent if child */ 239 + struct intel_context *parent; 240 + /** 241 + * @last_rq: last request submitted on a parallel context, used 242 + * to insert submit fences between requests in the parallel 243 + * context 244 + */ 245 + struct i915_request *last_rq; 246 + /** 247 + * @fence_context: fence context composite fence when doing 248 + * parallel submission 249 + */ 250 + u64 fence_context; 251 + /** 252 + * @seqno: seqno for composite fence when doing parallel 253 + * submission 254 + */ 255 + u32 seqno; 256 + /** @number_children: number of children if parent */ 257 + u8 number_children; 258 + /** @child_index: index into child_list if child */ 259 + u8 child_index; 260 + /** @guc: GuC specific members for parallel submission */ 261 + struct { 262 + /** @wqi_head: head pointer in work queue */ 263 + u16 wqi_head; 264 + /** @wqi_tail: tail pointer in work queue */ 265 + u16 wqi_tail; 266 + /** 267 + * @parent_page: page in context state (ce->state) used 268 + * by parent for work queue, process descriptor 269 + */ 270 + u8 parent_page; 271 + } guc; 272 + } parallel; 220 273 221 274 #ifdef CONFIG_DRM_I915_SELFTEST 222 275 /**
+12 -1
drivers/gpu/drm/i915/gt/intel_engine.h
··· 2 2 #ifndef _INTEL_RINGBUFFER_H_ 3 3 #define _INTEL_RINGBUFFER_H_ 4 4 5 + #include <asm/cacheflush.h> 5 6 #include <drm/drm_util.h> 6 7 7 8 #include <linux/hashtable.h> ··· 282 281 return intel_engine_has_preemption(engine); 283 282 } 284 283 284 + #define FORCE_VIRTUAL BIT(0) 285 285 struct intel_context * 286 286 intel_engine_create_virtual(struct intel_engine_cs **siblings, 287 - unsigned int count); 287 + unsigned int count, unsigned long flags); 288 + 289 + static inline struct intel_context * 290 + intel_engine_create_parallel(struct intel_engine_cs **engines, 291 + unsigned int num_engines, 292 + unsigned int width) 293 + { 294 + GEM_BUG_ON(!engines[0]->cops->create_parallel); 295 + return engines[0]->cops->create_parallel(engines, num_engines, width); 296 + } 288 297 289 298 static inline bool 290 299 intel_virtual_engine_has_heartbeat(const struct intel_engine_cs *engine)
+54 -12
drivers/gpu/drm/i915/gt/intel_engine_cs.c
··· 290 290 GEM_DEBUG_WARN_ON(iir); 291 291 } 292 292 293 - static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id) 293 + static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id, 294 + u8 logical_instance) 294 295 { 295 296 const struct engine_info *info = &intel_engines[id]; 296 297 struct drm_i915_private *i915 = gt->i915; ··· 336 335 337 336 engine->class = info->class; 338 337 engine->instance = info->instance; 338 + engine->logical_mask = BIT(logical_instance); 339 339 __sprint_engine_name(engine); 340 340 341 341 engine->props.heartbeat_interval_ms = ··· 590 588 return info->engine_mask; 591 589 } 592 590 591 + static void populate_logical_ids(struct intel_gt *gt, u8 *logical_ids, 592 + u8 class, const u8 *map, u8 num_instances) 593 + { 594 + int i, j; 595 + u8 current_logical_id = 0; 596 + 597 + for (j = 0; j < num_instances; ++j) { 598 + for (i = 0; i < ARRAY_SIZE(intel_engines); ++i) { 599 + if (!HAS_ENGINE(gt, i) || 600 + intel_engines[i].class != class) 601 + continue; 602 + 603 + if (intel_engines[i].instance == map[j]) { 604 + logical_ids[intel_engines[i].instance] = 605 + current_logical_id++; 606 + break; 607 + } 608 + } 609 + } 610 + } 611 + 612 + static void setup_logical_ids(struct intel_gt *gt, u8 *logical_ids, u8 class) 613 + { 614 + int i; 615 + u8 map[MAX_ENGINE_INSTANCE + 1]; 616 + 617 + for (i = 0; i < MAX_ENGINE_INSTANCE + 1; ++i) 618 + map[i] = i; 619 + populate_logical_ids(gt, logical_ids, class, map, ARRAY_SIZE(map)); 620 + } 621 + 593 622 /** 594 623 * intel_engines_init_mmio() - allocate and prepare the Engine Command Streamers 595 624 * @gt: pointer to struct intel_gt ··· 632 599 struct drm_i915_private *i915 = gt->i915; 633 600 const unsigned int engine_mask = init_engine_mask(gt); 634 601 unsigned int mask = 0; 635 - unsigned int i; 602 + unsigned int i, class; 603 + u8 logical_ids[MAX_ENGINE_INSTANCE + 1]; 636 604 int err; 637 605 638 606 drm_WARN_ON(&i915->drm, engine_mask == 0); ··· 643 609 
if (i915_inject_probe_failure(i915)) 644 610 return -ENODEV; 645 611 646 - for (i = 0; i < ARRAY_SIZE(intel_engines); i++) { 647 - if (!HAS_ENGINE(gt, i)) 648 - continue; 612 + for (class = 0; class < MAX_ENGINE_CLASS + 1; ++class) { 613 + setup_logical_ids(gt, logical_ids, class); 649 614 650 - err = intel_engine_setup(gt, i); 651 - if (err) 652 - goto cleanup; 615 + for (i = 0; i < ARRAY_SIZE(intel_engines); ++i) { 616 + u8 instance = intel_engines[i].instance; 653 617 654 - mask |= BIT(i); 618 + if (intel_engines[i].class != class || 619 + !HAS_ENGINE(gt, i)) 620 + continue; 621 + 622 + err = intel_engine_setup(gt, i, 623 + logical_ids[instance]); 624 + if (err) 625 + goto cleanup; 626 + 627 + mask |= BIT(i); 628 + } 655 629 } 656 630 657 631 /* ··· 1953 1911 1954 1912 struct intel_context * 1955 1913 intel_engine_create_virtual(struct intel_engine_cs **siblings, 1956 - unsigned int count) 1914 + unsigned int count, unsigned long flags) 1957 1915 { 1958 1916 if (count == 0) 1959 1917 return ERR_PTR(-EINVAL); 1960 1918 1961 - if (count == 1) 1919 + if (count == 1 && !(flags & FORCE_VIRTUAL)) 1962 1920 return intel_context_create(siblings[0]); 1963 1921 1964 1922 GEM_BUG_ON(!siblings[0]->cops->create_virtual); 1965 - return siblings[0]->cops->create_virtual(siblings, count); 1923 + return siblings[0]->cops->create_virtual(siblings, count, flags); 1966 1924 } 1967 1925 1968 1926 struct i915_request *
+13
drivers/gpu/drm/i915/gt/intel_engine_pm.c
··· 162 162 unsigned long flags; 163 163 bool result = true; 164 164 165 + /* 166 + * This is execlist specific behaviour intended to ensure the GPU is 167 + * idle by switching to a known 'safe' context. With GuC submission, the 168 + * same idle guarantee is achieved by other means (disabling 169 + * scheduling). Further, switching to a 'safe' context has no effect 170 + * with GuC submission as the scheduler can just switch back again. 171 + * 172 + * FIXME: Move this backend scheduler specific behaviour into the 173 + * scheduler backend. 174 + */ 175 + if (intel_engine_uses_guc(engine)) 176 + return true; 177 + 165 178 /* GPU is pointing to the void, as good as in the kernel context. */ 166 179 if (intel_gt_is_wedged(engine->gt)) 167 180 return true;
+37
drivers/gpu/drm/i915/gt/intel_engine_pm.h
··· 6 6 #ifndef INTEL_ENGINE_PM_H 7 7 #define INTEL_ENGINE_PM_H 8 8 9 + #include "i915_drv.h" 9 10 #include "i915_request.h" 10 11 #include "intel_engine_types.h" 11 12 #include "intel_wakeref.h" 13 + #include "intel_gt_pm.h" 12 14 13 15 static inline bool 14 16 intel_engine_pm_is_awake(const struct intel_engine_cs *engine) 15 17 { 16 18 return intel_wakeref_is_active(&engine->wakeref); 19 + } 20 + 21 + static inline void __intel_engine_pm_get(struct intel_engine_cs *engine) 22 + { 23 + __intel_wakeref_get(&engine->wakeref); 17 24 } 18 25 19 26 static inline void intel_engine_pm_get(struct intel_engine_cs *engine) ··· 31 24 static inline bool intel_engine_pm_get_if_awake(struct intel_engine_cs *engine) 32 25 { 33 26 return intel_wakeref_get_if_active(&engine->wakeref); 27 + } 28 + 29 + static inline void intel_engine_pm_might_get(struct intel_engine_cs *engine) 30 + { 31 + if (!intel_engine_is_virtual(engine)) { 32 + intel_wakeref_might_get(&engine->wakeref); 33 + } else { 34 + struct intel_gt *gt = engine->gt; 35 + struct intel_engine_cs *tengine; 36 + intel_engine_mask_t tmp, mask = engine->mask; 37 + 38 + for_each_engine_masked(tengine, gt, mask, tmp) 39 + intel_wakeref_might_get(&tengine->wakeref); 40 + } 41 + intel_gt_pm_might_get(engine->gt); 34 42 } 35 43 36 44 static inline void intel_engine_pm_put(struct intel_engine_cs *engine) ··· 67 45 static inline void intel_engine_pm_flush(struct intel_engine_cs *engine) 68 46 { 69 47 intel_wakeref_unlock_wait(&engine->wakeref); 48 + } 49 + 50 + static inline void intel_engine_pm_might_put(struct intel_engine_cs *engine) 51 + { 52 + if (!intel_engine_is_virtual(engine)) { 53 + intel_wakeref_might_put(&engine->wakeref); 54 + } else { 55 + struct intel_gt *gt = engine->gt; 56 + struct intel_engine_cs *tengine; 57 + intel_engine_mask_t tmp, mask = engine->mask; 58 + 59 + for_each_engine_masked(tengine, gt, mask, tmp) 60 + intel_wakeref_might_put(&tengine->wakeref); 61 + } 62 + intel_gt_pm_might_put(engine->gt); 70 63 } 
71 64 72 65 static inline struct i915_request *
+7
drivers/gpu/drm/i915/gt/intel_engine_types.h
··· 269 269 unsigned int guc_id; 270 270 271 271 intel_engine_mask_t mask; 272 + /** 273 + * @logical_mask: logical mask of engine, reported to user space via 274 + * query IOCTL and used to communicate with the GuC in logical space. 275 + * The logical instance of a physical engine can change based on product 276 + * and fusing. 277 + */ 278 + intel_engine_mask_t logical_mask; 272 279 273 280 u8 class; 274 281 u8 instance;
+5 -2
drivers/gpu/drm/i915/gt/intel_execlists_submission.c
··· 201 201 } 202 202 203 203 static struct intel_context * 204 - execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count); 204 + execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count, 205 + unsigned long flags); 205 206 206 207 static struct i915_request * 207 208 __active_request(const struct intel_timeline * const tl, ··· 3785 3784 } 3786 3785 3787 3786 static struct intel_context * 3788 - execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count) 3787 + execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count, 3788 + unsigned long flags) 3789 3789 { 3790 3790 struct virtual_engine *ve; 3791 3791 unsigned int n; ··· 3879 3877 3880 3878 ve->siblings[ve->num_siblings++] = sibling; 3881 3879 ve->base.mask |= sibling->mask; 3880 + ve->base.logical_mask |= sibling->logical_mask; 3882 3881 3883 3882 /* 3884 3883 * All physical engines must be compatible for their emission
+55
drivers/gpu/drm/i915/gt/intel_gt_debugfs.c
··· 13 13 #include "pxp/intel_pxp_debugfs.h" 14 14 #include "uc/intel_uc_debugfs.h" 15 15 16 + int intel_gt_debugfs_reset_show(struct intel_gt *gt, u64 *val) 17 + { 18 + int ret = intel_gt_terminally_wedged(gt); 19 + 20 + switch (ret) { 21 + case -EIO: 22 + *val = 1; 23 + return 0; 24 + case 0: 25 + *val = 0; 26 + return 0; 27 + default: 28 + return ret; 29 + } 30 + } 31 + 32 + int intel_gt_debugfs_reset_store(struct intel_gt *gt, u64 val) 33 + { 34 + /* Flush any previous reset before applying for a new one */ 35 + wait_event(gt->reset.queue, 36 + !test_bit(I915_RESET_BACKOFF, &gt->reset.flags)); 37 + 38 + intel_gt_handle_error(gt, val, I915_ERROR_CAPTURE, 39 + "Manually reset engine mask to %llx", val); 40 + return 0; 41 + } 42 + 43 + /* 44 + * keep the interface clean where the first parameter 45 + * is a 'struct intel_gt *' instead of 'void *' 46 + */ 47 + static int __intel_gt_debugfs_reset_show(void *data, u64 *val) 48 + { 49 + return intel_gt_debugfs_reset_show(data, val); 50 + } 51 + 52 + static int __intel_gt_debugfs_reset_store(void *data, u64 val) 53 + { 54 + return intel_gt_debugfs_reset_store(data, val); 55 + } 56 + 57 + DEFINE_SIMPLE_ATTRIBUTE(reset_fops, __intel_gt_debugfs_reset_show, 58 + __intel_gt_debugfs_reset_store, "%llu\n"); 59 + 60 + static void gt_debugfs_register(struct intel_gt *gt, struct dentry *root) 61 + { 62 + static const struct intel_gt_debugfs_file files[] = { 63 + { "reset", &reset_fops, NULL }, 64 + }; 65 + 66 + intel_gt_debugfs_register_files(root, files, ARRAY_SIZE(files), gt); 67 + } 68 + 16 69 void intel_gt_debugfs_register(struct intel_gt *gt) 17 70 { 18 71 struct dentry *root; ··· 76 23 root = debugfs_create_dir("gt", gt->i915->drm.primary->debugfs_root); 77 24 if (IS_ERR(root)) 78 25 return; 26 + 27 + gt_debugfs_register(gt, root); 79 28 80 29 intel_gt_engines_debugfs_register(gt, root); 81 30 intel_gt_pm_debugfs_register(gt, root);
+4
drivers/gpu/drm/i915/gt/intel_gt_debugfs.h
··· 35 35 const struct intel_gt_debugfs_file *files, 36 36 unsigned long count, void *data); 37 37 38 + /* functions that need to be accessed by the upper level non-gt interfaces */ 39 + int intel_gt_debugfs_reset_show(struct intel_gt *gt, u64 *val); 40 + int intel_gt_debugfs_reset_store(struct intel_gt *gt, u64 val); 41 + 38 42 #endif /* INTEL_GT_DEBUGFS_H */
+14
drivers/gpu/drm/i915/gt/intel_gt_pm.h
··· 31 31 return intel_wakeref_get_if_active(&gt->wakeref); 32 32 } 33 33 34 + static inline void intel_gt_pm_might_get(struct intel_gt *gt) 35 + { 36 + intel_wakeref_might_get(&gt->wakeref); 37 + } 38 + 34 39 static inline void intel_gt_pm_put(struct intel_gt *gt) 35 40 { 36 41 intel_wakeref_put(&gt->wakeref); ··· 45 40 { 46 41 intel_wakeref_put_async(&gt->wakeref); 47 42 } 43 + 44 + static inline void intel_gt_pm_might_put(struct intel_gt *gt) 45 + { 46 + intel_wakeref_might_put(&gt->wakeref); 47 + } 48 + 49 + #define with_intel_gt_pm(gt, tmp) \ 50 + for (tmp = 1, intel_gt_pm_get(gt); tmp; \ 51 + intel_gt_pm_put(gt), tmp = 0) 48 52 49 53 static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt) 50 54 {
+41
drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c
··· 20 20 #include "intel_uncore.h" 21 21 #include "vlv_sideband.h" 22 22 23 + int intel_gt_pm_debugfs_forcewake_user_open(struct intel_gt *gt) 24 + { 25 + atomic_inc(&gt->user_wakeref); 26 + intel_gt_pm_get(gt); 27 + if (GRAPHICS_VER(gt->i915) >= 6) 28 + intel_uncore_forcewake_user_get(gt->uncore); 29 + 30 + return 0; 31 + } 32 + 33 + int intel_gt_pm_debugfs_forcewake_user_release(struct intel_gt *gt) 34 + { 35 + if (GRAPHICS_VER(gt->i915) >= 6) 36 + intel_uncore_forcewake_user_put(gt->uncore); 37 + intel_gt_pm_put(gt); 38 + atomic_dec(&gt->user_wakeref); 39 + 40 + return 0; 41 + } 42 + 43 + static int forcewake_user_open(struct inode *inode, struct file *file) 44 + { 45 + struct intel_gt *gt = inode->i_private; 46 + 47 + return intel_gt_pm_debugfs_forcewake_user_open(gt); 48 + } 49 + 50 + static int forcewake_user_release(struct inode *inode, struct file *file) 51 + { 52 + struct intel_gt *gt = inode->i_private; 53 + 54 + return intel_gt_pm_debugfs_forcewake_user_release(gt); 55 + } 56 + 57 + static const struct file_operations forcewake_user_fops = { 58 + .owner = THIS_MODULE, 59 + .open = forcewake_user_open, 60 + .release = forcewake_user_release, 61 + }; 62 + 23 63 static int fw_domains_show(struct seq_file *m, void *data) 24 64 { 25 65 struct intel_gt *gt = m->private; ··· 668 628 { "drpc", &drpc_fops, NULL }, 669 629 { "frequency", &frequency_fops, NULL }, 670 630 { "forcewake", &fw_domains_fops, NULL }, 631 + { "forcewake_user", &forcewake_user_fops, NULL}, 671 632 { "llc", &llc_fops, llc_eval }, 672 633 { "rps_boost", &rps_boost_fops, rps_eval }, 673 634 };
+4
drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.h
··· 13 13 void intel_gt_pm_debugfs_register(struct intel_gt *gt, struct dentry *root); 14 14 void intel_gt_pm_frequency_dump(struct intel_gt *gt, struct drm_printer *m); 15 15 16 + /* functions that need to be accessed by the upper level non-gt interfaces */ 17 + int intel_gt_pm_debugfs_forcewake_user_open(struct intel_gt *gt); 18 + int intel_gt_pm_debugfs_forcewake_user_release(struct intel_gt *gt); 19 + 16 20 #endif /* INTEL_GT_PM_DEBUGFS_H */
+1
drivers/gpu/drm/i915/gt/intel_llc.c
··· 3 3 * Copyright © 2019 Intel Corporation 4 4 */ 5 5 6 + #include <asm/tsc.h> 6 7 #include <linux/cpufreq.h> 7 8 8 9 #include "i915_drv.h"
+5
drivers/gpu/drm/i915/gt/intel_lrc.c
··· 942 942 context_size += PAGE_SIZE; 943 943 } 944 944 945 + if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) { 946 + ce->parallel.guc.parent_page = context_size / PAGE_SIZE; 947 + context_size += PARENT_SCRATCH_SIZE; 948 + } 949 + 945 950 obj = i915_gem_object_create_lmem(engine->i915, context_size, 946 951 I915_BO_ALLOC_PM_VOLATILE); 947 952 if (IS_ERR(obj))
+1 -1
drivers/gpu/drm/i915/gt/intel_ring_submission.c
··· 292 292 sanitize_hwsp(engine); 293 293 294 294 /* And scrub the dirty cachelines for the HWSP */ 295 - clflush_cache_range(engine->status_page.addr, PAGE_SIZE); 295 + drm_clflush_virt_range(engine->status_page.addr, PAGE_SIZE); 296 296 297 297 intel_engine_reset_pinned_contexts(engine); 298 298 }
+2 -2
drivers/gpu/drm/i915/gt/intel_timeline.c
··· 64 64 65 65 timeline->hwsp_map = vaddr; 66 66 timeline->hwsp_seqno = memset(vaddr + ofs, 0, TIMELINE_SEQNO_BYTES); 67 - clflush(vaddr + ofs); 67 + drm_clflush_virt_range(vaddr + ofs, TIMELINE_SEQNO_BYTES); 68 68 69 69 return 0; 70 70 } ··· 225 225 226 226 memset(hwsp_seqno + 1, 0, TIMELINE_SEQNO_BYTES - sizeof(*hwsp_seqno)); 227 227 WRITE_ONCE(*hwsp_seqno, tl->seqno); 228 - clflush(hwsp_seqno); 228 + drm_clflush_virt_range(hwsp_seqno, TIMELINE_SEQNO_BYTES); 229 229 } 230 230 231 231 void intel_timeline_enter(struct intel_timeline *tl)
+6 -6
drivers/gpu/drm/i915/gt/selftest_execlists.c
··· 3733 3733 GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ve)); 3734 3734 3735 3735 for (n = 0; n < nctx; n++) { 3736 - ve[n] = intel_engine_create_virtual(siblings, nsibling); 3736 + ve[n] = intel_engine_create_virtual(siblings, nsibling, 0); 3737 3737 if (IS_ERR(ve[n])) { 3738 3738 err = PTR_ERR(ve[n]); 3739 3739 nctx = n; ··· 3929 3929 * restrict it to our desired engine within the virtual engine. 3930 3930 */ 3931 3931 3932 - ve = intel_engine_create_virtual(siblings, nsibling); 3932 + ve = intel_engine_create_virtual(siblings, nsibling, 0); 3933 3933 if (IS_ERR(ve)) { 3934 3934 err = PTR_ERR(ve); 3935 3935 goto out_close; ··· 4060 4060 i915_request_add(rq); 4061 4061 } 4062 4062 4063 - ce = intel_engine_create_virtual(siblings, nsibling); 4063 + ce = intel_engine_create_virtual(siblings, nsibling, 0); 4064 4064 if (IS_ERR(ce)) { 4065 4065 err = PTR_ERR(ce); 4066 4066 goto out; ··· 4112 4112 4113 4113 /* XXX We do not handle oversubscription and fairness with normal rq */ 4114 4114 for (n = 0; n < nsibling; n++) { 4115 - ce = intel_engine_create_virtual(siblings, nsibling); 4115 + ce = intel_engine_create_virtual(siblings, nsibling, 0); 4116 4116 if (IS_ERR(ce)) { 4117 4117 err = PTR_ERR(ce); 4118 4118 goto out; ··· 4214 4214 if (err) 4215 4215 goto out_scratch; 4216 4216 4217 - ve = intel_engine_create_virtual(siblings, nsibling); 4217 + ve = intel_engine_create_virtual(siblings, nsibling, 0); 4218 4218 if (IS_ERR(ve)) { 4219 4219 err = PTR_ERR(ve); 4220 4220 goto out_scratch; ··· 4354 4354 if (igt_spinner_init(&spin, gt)) 4355 4355 return -ENOMEM; 4356 4356 4357 - ve = intel_engine_create_virtual(siblings, nsibling); 4357 + ve = intel_engine_create_virtual(siblings, nsibling, 0); 4358 4358 if (IS_ERR(ve)) { 4359 4359 err = PTR_ERR(ve); 4360 4360 goto out_spin;
+1
drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
··· 142 142 INTEL_GUC_ACTION_REGISTER_COMMAND_TRANSPORT_BUFFER = 0x4505, 143 143 INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506, 144 144 INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600, 145 + INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601, 145 146 INTEL_GUC_ACTION_RESET_CLIENT = 0x5507, 146 147 INTEL_GUC_ACTION_LIMIT 147 148 };
+29
drivers/gpu/drm/i915/gt/uc/intel_guc.c
··· 756 756 } 757 757 } 758 758 } 759 + 760 + void intel_guc_write_barrier(struct intel_guc *guc) 761 + { 762 + struct intel_gt *gt = guc_to_gt(guc); 763 + 764 + if (i915_gem_object_is_lmem(guc->ct.vma->obj)) { 765 + /* 766 + * Ensure intel_uncore_write_fw can be used rather than 767 + * intel_uncore_write. 768 + */ 769 + GEM_BUG_ON(guc->send_regs.fw_domains); 770 + 771 + /* 772 + * This register is used by the i915 and GuC for MMIO based 773 + * communication. Once we are in this code CTBs are the only 774 + * method the i915 uses to communicate with the GuC so it is 775 + * safe to write to this register (a value of 0 is NOP for MMIO 776 + * communication). If we ever start mixing CTBs and MMIOs a new 777 + * register will have to be chosen. This function is also used 778 + * to enforce ordering of a work queue item write and an update 779 + * to the process descriptor. When a work queue is being used, 780 + * CTBs are also the only mechanism of communication. 781 + */ 782 + intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0); 783 + } else { 784 + /* wmb() sufficient for a barrier if in smem */ 785 + wmb(); 786 + } 787 + }
+45 -9
drivers/gpu/drm/i915/gt/uc/intel_guc.h
··· 46 46 * submitted until the stalled request is processed. 47 47 */ 48 48 struct i915_request *stalled_request; 49 + /** 50 + * @submission_stall_reason: reason why submission is stalled 51 + */ 52 + enum { 53 + STALL_NONE, 54 + STALL_REGISTER_CONTEXT, 55 + STALL_MOVE_LRC_TAIL, 56 + STALL_ADD_REQUEST, 57 + } submission_stall_reason; 49 58 50 59 /* intel_guc_recv interrupt related state */ 51 60 /** @irq_lock: protects GuC irq state */ ··· 80 71 } interrupts; 81 72 82 73 /** 83 - * @contexts_lock: protects guc_ids, guc_id_list, ce->guc_id.id, and 84 - * ce->guc_id.ref when transitioning in and out of zero 74 + * @submission_state: sub-structure for submission state protected by 75 + * single lock 85 76 */ 86 - spinlock_t contexts_lock; 87 - /** @guc_ids: used to allocate unique ce->guc_id.id values */ 88 - struct ida guc_ids; 89 - /** 90 - * @guc_id_list: list of intel_context with valid guc_ids but no refs 91 - */ 92 - struct list_head guc_id_list; 77 + struct { 78 + /** 79 + * @lock: protects everything in submission_state, 80 + * ce->guc_id.id, and ce->guc_id.ref when transitioning in and 81 + * out of zero 82 + */ 83 + spinlock_t lock; 84 + /** 85 + * @guc_ids: used to allocate new guc_ids, single-lrc 86 + */ 87 + struct ida guc_ids; 88 + /** 89 + * @guc_ids_bitmap: used to allocate new guc_ids, multi-lrc 90 + */ 91 + unsigned long *guc_ids_bitmap; 92 + /** 93 + * @guc_id_list: list of intel_context with valid guc_ids but no 94 + * refs 95 + */ 96 + struct list_head guc_id_list; 97 + /** 98 + * @destroyed_contexts: list of contexts waiting to be destroyed 99 + * (deregistered with the GuC) 100 + */ 101 + struct list_head destroyed_contexts; 102 + /** 103 + * @destroyed_worker: worker to deregister contexts, need as we 104 + * need to take a GT PM reference and can't from destroy 105 + * function as it might be in an atomic context (no sleeping) 106 + */ 107 + struct work_struct destroyed_worker; 108 + } submission_state; 93 109 94 110 /** 95 111 * 
@submission_supported: tracks whether we support GuC submission on ··· 375 341 void intel_guc_submission_cancel_requests(struct intel_guc *guc); 376 342 377 343 void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p); 344 + 345 + void intel_guc_write_barrier(struct intel_guc *guc); 378 346 379 347 #endif
+1 -1
drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
··· 176 176 for_each_engine(engine, gt, id) { 177 177 u8 guc_class = engine_class_to_guc_class(engine->class); 178 178 179 - system_info->mapping_table[guc_class][engine->instance] = 179 + system_info->mapping_table[guc_class][ilog2(engine->logical_mask)] = 180 180 engine->instance; 181 181 } 182 182 }
+1 -23
drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
··· 383 383 return ++ct->requests.last_fence; 384 384 } 385 385 386 - static void write_barrier(struct intel_guc_ct *ct) 387 - { 388 - struct intel_guc *guc = ct_to_guc(ct); 389 - struct intel_gt *gt = guc_to_gt(guc); 390 - 391 - if (i915_gem_object_is_lmem(guc->ct.vma->obj)) { 392 - GEM_BUG_ON(guc->send_regs.fw_domains); 393 - /* 394 - * This register is used by the i915 and GuC for MMIO based 395 - * communication. Once we are in this code CTBs are the only 396 - * method the i915 uses to communicate with the GuC so it is 397 - * safe to write to this register (a value of 0 is NOP for MMIO 398 - * communication). If we ever start mixing CTBs and MMIOs a new 399 - * register will have to be chosen. 400 - */ 401 - intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0); 402 - } else { 403 - /* wmb() sufficient for a barrier if in smem */ 404 - wmb(); 405 - } 406 - } 407 - 408 386 static int ct_write(struct intel_guc_ct *ct, 409 387 const u32 *action, 410 388 u32 len /* in dwords */, ··· 452 474 * make sure H2G buffer update and LRC tail update (if this triggering a 453 475 * submission) are visible before updating the descriptor tail 454 476 */ 455 - write_barrier(ct); 477 + intel_guc_write_barrier(ct_to_guc(ct)); 456 478 457 479 /* update local copies */ 458 480 ctb->tail = tail;
+16 -16
drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
··· 52 52 53 53 #define GUC_DOORBELL_INVALID 256 54 54 55 - #define GUC_WQ_SIZE (PAGE_SIZE * 2) 56 - 57 - /* Work queue item header definitions */ 55 + /* 56 + * Work queue item header definitions 57 + * 58 + * Work queue is circular buffer used to submit complex (multi-lrc) submissions 59 + * to the GuC. A work queue item is an entry in the circular buffer. 60 + */ 58 61 #define WQ_STATUS_ACTIVE 1 59 62 #define WQ_STATUS_SUSPENDED 2 60 63 #define WQ_STATUS_CMD_ERROR 3 61 64 #define WQ_STATUS_ENGINE_ID_NOT_USED 4 62 65 #define WQ_STATUS_SUSPENDED_FROM_RESET 5 63 - #define WQ_TYPE_SHIFT 0 64 - #define WQ_TYPE_BATCH_BUF (0x1 << WQ_TYPE_SHIFT) 65 - #define WQ_TYPE_PSEUDO (0x2 << WQ_TYPE_SHIFT) 66 - #define WQ_TYPE_INORDER (0x3 << WQ_TYPE_SHIFT) 67 - #define WQ_TYPE_NOOP (0x4 << WQ_TYPE_SHIFT) 68 - #define WQ_TARGET_SHIFT 10 69 - #define WQ_LEN_SHIFT 16 70 - #define WQ_NO_WCFLUSH_WAIT (1 << 27) 71 - #define WQ_PRESENT_WORKLOAD (1 << 28) 66 + #define WQ_TYPE_BATCH_BUF 0x1 67 + #define WQ_TYPE_PSEUDO 0x2 68 + #define WQ_TYPE_INORDER 0x3 69 + #define WQ_TYPE_NOOP 0x4 70 + #define WQ_TYPE_MULTI_LRC 0x5 71 + #define WQ_TYPE_MASK GENMASK(7, 0) 72 + #define WQ_LEN_MASK GENMASK(26, 16) 72 73 73 - #define WQ_RING_TAIL_SHIFT 20 74 - #define WQ_RING_TAIL_MAX 0x7FF /* 2^11 QWords */ 75 - #define WQ_RING_TAIL_MASK (WQ_RING_TAIL_MAX << WQ_RING_TAIL_SHIFT) 74 + #define WQ_GUC_ID_MASK GENMASK(15, 0) 75 + #define WQ_RING_TAIL_MASK GENMASK(28, 18) 76 76 77 77 #define GUC_STAGE_DESC_ATTR_ACTIVE BIT(0) 78 78 #define GUC_STAGE_DESC_ATTR_PENDING_DB BIT(1) ··· 186 186 u32 wq_status; 187 187 u32 engine_presence; 188 188 u32 priority; 189 - u32 reserved[30]; 189 + u32 reserved[36]; 190 190 } __packed; 191 191 192 192 #define CONTEXT_REGISTRATION_FLAG_KMD BIT(0)
+1261 -189
drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
··· 11 11 #include "gt/intel_context.h" 12 12 #include "gt/intel_engine_pm.h" 13 13 #include "gt/intel_engine_heartbeat.h" 14 + #include "gt/intel_gpu_commands.h" 14 15 #include "gt/intel_gt.h" 15 16 #include "gt/intel_gt_irq.h" 16 17 #include "gt/intel_gt_pm.h" ··· 69 68 * fence is used to stall all requests associated with this guc_id until the 70 69 * corresponding G2H returns indicating the guc_id has been deregistered. 71 70 * 72 - * guc_ids: 71 + * submission_state.guc_ids: 73 72 * Unique number associated with private GuC context data passed in during 74 73 * context registration / submission / deregistration. 64k available. Simple ida 75 74 * is used for allocation. ··· 90 89 * sched_engine can be submitting at a time. Currently only one sched_engine is 91 90 * used for all of GuC submission but that could change in the future. 92 91 * 93 - * guc->contexts_lock 94 - * Protects guc_id allocation for the given GuC, i.e. only one context can be 95 - * doing guc_id allocation operations at a time for each GuC in the system. 92 + * guc->submission_state.lock 93 + * Global lock for GuC submission state. Protects guc_ids and destroyed contexts 94 + * list. 96 95 * 97 96 * ce->guc_state.lock 98 97 * Protects everything under ce->guc_state. 
Ensures that a context is in the ··· 104 103 * 105 104 * Lock ordering rules: 106 105 * sched_engine->lock -> ce->guc_state.lock 107 - * guc->contexts_lock -> ce->guc_state.lock 106 + * guc->submission_state.lock -> ce->guc_state.lock 108 107 * 109 108 * Reset races: 110 109 * When a full GT reset is triggered it is assumed that some G2H responses to ··· 125 124 }; 126 125 127 126 static struct intel_context * 128 - guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count); 127 + guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, 128 + unsigned long flags); 129 + 130 + static struct intel_context * 131 + guc_create_parallel(struct intel_engine_cs **engines, 132 + unsigned int num_siblings, 133 + unsigned int width); 129 134 130 135 #define GUC_REQUEST_SIZE 64 /* bytes */ 136 + 137 + /* 138 + * We reserve 1/16 of the guc_ids for multi-lrc as these need to be contiguous 139 + * per the GuC submission interface. A different allocation algorithm is used 140 + * (bitmap vs. ida) between multi-lrc and single-lrc hence the reason to 141 + * partition the guc_id space. We believe the number of multi-lrc contexts in 142 + * use should be low and 1/16 should be sufficient. Minimum of 32 guc_ids for 143 + * multi-lrc. 
144 + */ 145 + #define NUMBER_MULTI_LRC_GUC_ID (GUC_MAX_LRC_DESCRIPTORS / 16) 131 146 132 147 /* 133 148 * Below is a set of functions which control the GuC scheduling state which ··· 341 324 GEM_BUG_ON(ce->guc_state.number_committed_requests < 0); 342 325 } 343 326 327 + static struct intel_context * 328 + request_to_scheduling_context(struct i915_request *rq) 329 + { 330 + return intel_context_to_parent(rq->context); 331 + } 332 + 344 333 static inline bool context_guc_id_invalid(struct intel_context *ce) 345 334 { 346 335 return ce->guc_id.id == GUC_INVALID_LRC_ID; ··· 365 342 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 366 343 { 367 344 return rb_entry(rb, struct i915_priolist, node); 345 + } 346 + 347 + /* 348 + * When using multi-lrc submission a scratch memory area is reserved in the 349 + * parent's context state for the process descriptor, work queue, and handshake 350 + * between the parent + children contexts to insert safe preemption points 351 + * between each of the BBs. Currently the scratch area is sized to a page. 352 + * 353 + * The layout of this scratch area is below: 354 + * 0 guc_process_desc 355 + * + sizeof(struct guc_process_desc) child go 356 + * + CACHELINE_BYTES child join[0] 357 + * ... 358 + * + CACHELINE_BYTES child join[n - 1] 359 + * ... unused 360 + * PARENT_SCRATCH_SIZE / 2 work queue start 361 + * ... 
work queue 362 + * PARENT_SCRATCH_SIZE - 1 work queue end 363 + */ 364 + #define WQ_SIZE (PARENT_SCRATCH_SIZE / 2) 365 + #define WQ_OFFSET (PARENT_SCRATCH_SIZE - WQ_SIZE) 366 + 367 + struct sync_semaphore { 368 + u32 semaphore; 369 + u8 unused[CACHELINE_BYTES - sizeof(u32)]; 370 + }; 371 + 372 + struct parent_scratch { 373 + struct guc_process_desc pdesc; 374 + 375 + struct sync_semaphore go; 376 + struct sync_semaphore join[MAX_ENGINE_INSTANCE + 1]; 377 + 378 + u8 unused[WQ_OFFSET - sizeof(struct guc_process_desc) - 379 + sizeof(struct sync_semaphore) * (MAX_ENGINE_INSTANCE + 2)]; 380 + 381 + u32 wq[WQ_SIZE / sizeof(u32)]; 382 + }; 383 + 384 + static u32 __get_parent_scratch_offset(struct intel_context *ce) 385 + { 386 + GEM_BUG_ON(!ce->parallel.guc.parent_page); 387 + 388 + return ce->parallel.guc.parent_page * PAGE_SIZE; 389 + } 390 + 391 + static u32 __get_wq_offset(struct intel_context *ce) 392 + { 393 + BUILD_BUG_ON(offsetof(struct parent_scratch, wq) != WQ_OFFSET); 394 + 395 + return __get_parent_scratch_offset(ce) + WQ_OFFSET; 396 + } 397 + 398 + static struct parent_scratch * 399 + __get_parent_scratch(struct intel_context *ce) 400 + { 401 + BUILD_BUG_ON(sizeof(struct parent_scratch) != PARENT_SCRATCH_SIZE); 402 + BUILD_BUG_ON(sizeof(struct sync_semaphore) != CACHELINE_BYTES); 403 + 404 + /* 405 + * Need to subtract LRC_STATE_OFFSET here as the 406 + * parallel.guc.parent_page is the offset into ce->state while 407 + * ce->lrc_reg_reg is ce->state + LRC_STATE_OFFSET. 
408 + */ 409 + return (struct parent_scratch *) 410 + (ce->lrc_reg_state + 411 + ((__get_parent_scratch_offset(ce) - 412 + LRC_STATE_OFFSET) / sizeof(u32))); 413 + } 414 + 415 + static struct guc_process_desc * 416 + __get_process_desc(struct intel_context *ce) 417 + { 418 + struct parent_scratch *ps = __get_parent_scratch(ce); 419 + 420 + return &ps->pdesc; 421 + } 422 + 423 + static u32 *get_wq_pointer(struct guc_process_desc *desc, 424 + struct intel_context *ce, 425 + u32 wqi_size) 426 + { 427 + /* 428 + * Check for space in work queue. Caching a value of head pointer in 429 + * intel_context structure in order reduce the number accesses to shared 430 + * GPU memory which may be across a PCIe bus. 431 + */ 432 + #define AVAILABLE_SPACE \ 433 + CIRC_SPACE(ce->parallel.guc.wqi_tail, ce->parallel.guc.wqi_head, WQ_SIZE) 434 + if (wqi_size > AVAILABLE_SPACE) { 435 + ce->parallel.guc.wqi_head = READ_ONCE(desc->head); 436 + 437 + if (wqi_size > AVAILABLE_SPACE) 438 + return NULL; 439 + } 440 + #undef AVAILABLE_SPACE 441 + 442 + return &__get_parent_scratch(ce)->wq[ce->parallel.guc.wqi_tail / sizeof(u32)]; 368 443 } 369 444 370 445 static struct guc_lrc_desc *__get_lrc_desc(struct intel_guc *guc, u32 index) ··· 624 503 625 504 static int guc_lrc_desc_pin(struct intel_context *ce, bool loop); 626 505 627 - static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) 506 + static int __guc_add_request(struct intel_guc *guc, struct i915_request *rq) 628 507 { 629 508 int err = 0; 630 - struct intel_context *ce = rq->context; 509 + struct intel_context *ce = request_to_scheduling_context(rq); 631 510 u32 action[3]; 632 511 int len = 0; 633 512 u32 g2h_len_dw = 0; ··· 648 527 GEM_BUG_ON(!atomic_read(&ce->guc_id.ref)); 649 528 GEM_BUG_ON(context_guc_id_invalid(ce)); 650 529 651 - /* 652 - * Corner case where the GuC firmware was blown away and reloaded while 653 - * this context was pinned. 
654 - */ 655 - if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id))) { 656 - err = guc_lrc_desc_pin(ce, false); 657 - if (unlikely(err)) 658 - return err; 659 - } 660 - 661 530 spin_lock(&ce->guc_state.lock); 662 531 663 532 /* 664 533 * The request / context will be run on the hardware when scheduling 665 - * gets enabled in the unblock. 534 + * gets enabled in the unblock. For multi-lrc we still submit the 535 + * context to move the LRC tails. 666 536 */ 667 - if (unlikely(context_blocked(ce))) 537 + if (unlikely(context_blocked(ce) && !intel_context_is_parent(ce))) 668 538 goto out; 669 539 670 - enabled = context_enabled(ce); 540 + enabled = context_enabled(ce) || context_blocked(ce); 671 541 672 542 if (!enabled) { 673 543 action[len++] = INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET; ··· 677 565 trace_intel_context_sched_enable(ce); 678 566 atomic_inc(&guc->outstanding_submission_g2h); 679 567 set_context_enabled(ce); 568 + 569 + /* 570 + * Without multi-lrc KMD does the submission step (moving the 571 + * lrc tail) so enabling scheduling is sufficient to submit the 572 + * context. This isn't the case in multi-lrc submission as the 573 + * GuC needs to move the tails, hence the need for another H2G 574 + * to submit a multi-lrc context after enabling scheduling. 
575 + */ 576 + if (intel_context_is_parent(ce)) { 577 + action[0] = INTEL_GUC_ACTION_SCHED_CONTEXT; 578 + err = intel_guc_send_nb(guc, action, len - 1, 0); 579 + } 680 580 } else if (!enabled) { 681 581 clr_context_pending_enable(ce); 682 582 intel_context_put(ce); ··· 701 577 return err; 702 578 } 703 579 580 + static int guc_add_request(struct intel_guc *guc, struct i915_request *rq) 581 + { 582 + int ret = __guc_add_request(guc, rq); 583 + 584 + if (unlikely(ret == -EBUSY)) { 585 + guc->stalled_request = rq; 586 + guc->submission_stall_reason = STALL_ADD_REQUEST; 587 + } 588 + 589 + return ret; 590 + } 591 + 704 592 static inline void guc_set_lrc_tail(struct i915_request *rq) 705 593 { 706 594 rq->context->lrc_reg_state[CTX_RING_TAIL] = ··· 722 586 static inline int rq_prio(const struct i915_request *rq) 723 587 { 724 588 return rq->sched.attr.priority; 589 + } 590 + 591 + static bool is_multi_lrc_rq(struct i915_request *rq) 592 + { 593 + return intel_context_is_parallel(rq->context); 594 + } 595 + 596 + static bool can_merge_rq(struct i915_request *rq, 597 + struct i915_request *last) 598 + { 599 + return request_to_scheduling_context(rq) == 600 + request_to_scheduling_context(last); 601 + } 602 + 603 + static u32 wq_space_until_wrap(struct intel_context *ce) 604 + { 605 + return (WQ_SIZE - ce->parallel.guc.wqi_tail); 606 + } 607 + 608 + static void write_wqi(struct guc_process_desc *desc, 609 + struct intel_context *ce, 610 + u32 wqi_size) 611 + { 612 + BUILD_BUG_ON(!is_power_of_2(WQ_SIZE)); 613 + 614 + /* 615 + * Ensure WQI are visible before updating tail 616 + */ 617 + intel_guc_write_barrier(ce_to_guc(ce)); 618 + 619 + ce->parallel.guc.wqi_tail = (ce->parallel.guc.wqi_tail + wqi_size) & 620 + (WQ_SIZE - 1); 621 + WRITE_ONCE(desc->tail, ce->parallel.guc.wqi_tail); 622 + } 623 + 624 + static int guc_wq_noop_append(struct intel_context *ce) 625 + { 626 + struct guc_process_desc *desc = __get_process_desc(ce); 627 + u32 *wqi = get_wq_pointer(desc, ce, 
wq_space_until_wrap(ce)); 628 + u32 len_dw = wq_space_until_wrap(ce) / sizeof(u32) - 1; 629 + 630 + if (!wqi) 631 + return -EBUSY; 632 + 633 + GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw)); 634 + 635 + *wqi = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) | 636 + FIELD_PREP(WQ_LEN_MASK, len_dw); 637 + ce->parallel.guc.wqi_tail = 0; 638 + 639 + return 0; 640 + } 641 + 642 + static int __guc_wq_item_append(struct i915_request *rq) 643 + { 644 + struct intel_context *ce = request_to_scheduling_context(rq); 645 + struct intel_context *child; 646 + struct guc_process_desc *desc = __get_process_desc(ce); 647 + unsigned int wqi_size = (ce->parallel.number_children + 4) * 648 + sizeof(u32); 649 + u32 *wqi; 650 + u32 len_dw = (wqi_size / sizeof(u32)) - 1; 651 + int ret; 652 + 653 + /* Ensure context is in correct state updating work queue */ 654 + GEM_BUG_ON(!atomic_read(&ce->guc_id.ref)); 655 + GEM_BUG_ON(context_guc_id_invalid(ce)); 656 + GEM_BUG_ON(context_wait_for_deregister_to_register(ce)); 657 + GEM_BUG_ON(!lrc_desc_registered(ce_to_guc(ce), ce->guc_id.id)); 658 + 659 + /* Insert NOOP if this work queue item will wrap the tail pointer. 
*/ 660 + if (wqi_size > wq_space_until_wrap(ce)) { 661 + ret = guc_wq_noop_append(ce); 662 + if (ret) 663 + return ret; 664 + } 665 + 666 + wqi = get_wq_pointer(desc, ce, wqi_size); 667 + if (!wqi) 668 + return -EBUSY; 669 + 670 + GEM_BUG_ON(!FIELD_FIT(WQ_LEN_MASK, len_dw)); 671 + 672 + *wqi++ = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) | 673 + FIELD_PREP(WQ_LEN_MASK, len_dw); 674 + *wqi++ = ce->lrc.lrca; 675 + *wqi++ = FIELD_PREP(WQ_GUC_ID_MASK, ce->guc_id.id) | 676 + FIELD_PREP(WQ_RING_TAIL_MASK, ce->ring->tail / sizeof(u64)); 677 + *wqi++ = 0; /* fence_id */ 678 + for_each_child(ce, child) 679 + *wqi++ = child->ring->tail / sizeof(u64); 680 + 681 + write_wqi(desc, ce, wqi_size); 682 + 683 + return 0; 684 + } 685 + 686 + static int guc_wq_item_append(struct intel_guc *guc, 687 + struct i915_request *rq) 688 + { 689 + struct intel_context *ce = request_to_scheduling_context(rq); 690 + int ret = 0; 691 + 692 + if (likely(!intel_context_is_banned(ce))) { 693 + ret = __guc_wq_item_append(rq); 694 + 695 + if (unlikely(ret == -EBUSY)) { 696 + guc->stalled_request = rq; 697 + guc->submission_stall_reason = STALL_MOVE_LRC_TAIL; 698 + } 699 + } 700 + 701 + return ret; 702 + } 703 + 704 + static bool multi_lrc_submit(struct i915_request *rq) 705 + { 706 + struct intel_context *ce = request_to_scheduling_context(rq); 707 + 708 + intel_ring_set_tail(rq->ring, rq->tail); 709 + 710 + /* 711 + * We expect the front end (execbuf IOCTL) to set this flag on the last 712 + * request generated from a multi-BB submission. This indicates to the 713 + * backend (GuC interface) that we should submit this context thus 714 + * submitting all the requests generated in parallel. 
715 + */ 716 + return test_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, &rq->fence.flags) || 717 + intel_context_is_banned(ce); 725 718 } 726 719 727 720 static int guc_dequeue_one_context(struct intel_guc *guc) ··· 866 601 if (guc->stalled_request) { 867 602 submit = true; 868 603 last = guc->stalled_request; 869 - goto resubmit; 604 + 605 + switch (guc->submission_stall_reason) { 606 + case STALL_REGISTER_CONTEXT: 607 + goto register_context; 608 + case STALL_MOVE_LRC_TAIL: 609 + goto move_lrc_tail; 610 + case STALL_ADD_REQUEST: 611 + goto add_request; 612 + default: 613 + MISSING_CASE(guc->submission_stall_reason); 614 + } 870 615 } 871 616 872 617 while ((rb = rb_first_cached(&sched_engine->queue))) { ··· 884 609 struct i915_request *rq, *rn; 885 610 886 611 priolist_for_each_request_consume(rq, rn, p) { 887 - if (last && rq->context != last->context) 888 - goto done; 612 + if (last && !can_merge_rq(rq, last)) 613 + goto register_context; 889 614 890 615 list_del_init(&rq->sched.link); 891 616 ··· 893 618 894 619 trace_i915_request_in(rq, 0); 895 620 last = rq; 896 - submit = true; 621 + 622 + if (is_multi_lrc_rq(rq)) { 623 + /* 624 + * We need to coalesce all multi-lrc requests in 625 + * a relationship into a single H2G. We are 626 + * guaranteed that all of these requests will be 627 + * submitted sequentially. 
628 + */ 629 + if (multi_lrc_submit(rq)) { 630 + submit = true; 631 + goto register_context; 632 + } 633 + } else { 634 + submit = true; 635 + } 897 636 } 898 637 899 638 rb_erase_cached(&p->node, &sched_engine->queue); 900 639 i915_priolist_free(p); 901 640 } 902 - done: 641 + 642 + register_context: 903 643 if (submit) { 904 - guc_set_lrc_tail(last); 905 - resubmit: 644 + struct intel_context *ce = request_to_scheduling_context(last); 645 + 646 + if (unlikely(!lrc_desc_registered(guc, ce->guc_id.id) && 647 + !intel_context_is_banned(ce))) { 648 + ret = guc_lrc_desc_pin(ce, false); 649 + if (unlikely(ret == -EPIPE)) { 650 + goto deadlk; 651 + } else if (ret == -EBUSY) { 652 + guc->stalled_request = last; 653 + guc->submission_stall_reason = 654 + STALL_REGISTER_CONTEXT; 655 + goto schedule_tasklet; 656 + } else if (ret != 0) { 657 + GEM_WARN_ON(ret); /* Unexpected */ 658 + goto deadlk; 659 + } 660 + } 661 + 662 + move_lrc_tail: 663 + if (is_multi_lrc_rq(last)) { 664 + ret = guc_wq_item_append(guc, last); 665 + if (ret == -EBUSY) { 666 + goto schedule_tasklet; 667 + } else if (ret != 0) { 668 + GEM_WARN_ON(ret); /* Unexpected */ 669 + goto deadlk; 670 + } 671 + } else { 672 + guc_set_lrc_tail(last); 673 + } 674 + 675 + add_request: 906 676 ret = guc_add_request(guc, last); 907 - if (unlikely(ret == -EPIPE)) 677 + if (unlikely(ret == -EPIPE)) { 908 678 goto deadlk; 909 - else if (ret == -EBUSY) { 910 - tasklet_schedule(&sched_engine->tasklet); 911 - guc->stalled_request = last; 912 - return false; 679 + } else if (ret == -EBUSY) { 680 + goto schedule_tasklet; 681 + } else if (ret != 0) { 682 + GEM_WARN_ON(ret); /* Unexpected */ 683 + goto deadlk; 913 684 } 914 685 } 915 686 916 687 guc->stalled_request = NULL; 688 + guc->submission_stall_reason = STALL_NONE; 917 689 return submit; 918 690 919 691 deadlk: 920 692 sched_engine->tasklet.callback = NULL; 921 693 tasklet_disable_nosync(&sched_engine->tasklet); 694 + return false; 695 + 696 + schedule_tasklet: 697 + 
tasklet_schedule(&sched_engine->tasklet); 922 698 return false; 923 699 } 924 700 ··· 1045 719 if (deregister) 1046 720 guc_signal_context_fence(ce); 1047 721 if (destroyed) { 722 + intel_gt_pm_put_async(guc_to_gt(guc)); 1048 723 release_guc_id(guc, ce); 1049 724 __guc_context_destroy(ce); 1050 725 } ··· 1124 797 spin_unlock_irqrestore(&sched_engine->lock, flags); 1125 798 } 1126 799 800 + static void guc_flush_destroyed_contexts(struct intel_guc *guc); 801 + 1127 802 void intel_guc_submission_reset_prepare(struct intel_guc *guc) 1128 803 { 1129 804 int i; ··· 1144 815 spin_unlock_irq(&guc_to_gt(guc)->irq_lock); 1145 816 1146 817 guc_flush_submissions(guc); 818 + guc_flush_destroyed_contexts(guc); 1147 819 1148 820 /* 1149 821 * Handle any outstanding G2Hs before reset. Call IRQ handler directly ··· 1259 929 1260 930 static void __guc_reset_context(struct intel_context *ce, bool stalled) 1261 931 { 932 + bool local_stalled; 1262 933 struct i915_request *rq; 1263 934 unsigned long flags; 1264 935 u32 head; 936 + int i, number_children = ce->parallel.number_children; 1265 937 bool skip = false; 938 + struct intel_context *parent = ce; 939 + 940 + GEM_BUG_ON(intel_context_is_child(ce)); 1266 941 1267 942 intel_context_get(ce); 1268 943 ··· 1293 958 if (unlikely(skip)) 1294 959 goto out_put; 1295 960 1296 - rq = intel_context_find_active_request(ce); 1297 - if (!rq) { 1298 - head = ce->ring->tail; 1299 - stalled = false; 1300 - goto out_replay; 961 + /* 962 + * For each context in the relationship find the hanging request 963 + * resetting each context / request as needed 964 + */ 965 + for (i = 0; i < number_children + 1; ++i) { 966 + if (!intel_context_is_pinned(ce)) 967 + goto next_context; 968 + 969 + local_stalled = false; 970 + rq = intel_context_find_active_request(ce); 971 + if (!rq) { 972 + head = ce->ring->tail; 973 + goto out_replay; 974 + } 975 + 976 + if (i915_request_started(rq)) 977 + local_stalled = true; 978 + 979 + 
GEM_BUG_ON(i915_active_is_idle(&ce->active)); 980 + head = intel_ring_wrap(ce->ring, rq->head); 981 + 982 + __i915_request_reset(rq, local_stalled && stalled); 983 + out_replay: 984 + guc_reset_state(ce, head, local_stalled && stalled); 985 + next_context: 986 + if (i != number_children) 987 + ce = list_next_entry(ce, parallel.child_link); 1301 988 } 1302 989 1303 - if (!i915_request_started(rq)) 1304 - stalled = false; 1305 - 1306 - GEM_BUG_ON(i915_active_is_idle(&ce->active)); 1307 - head = intel_ring_wrap(ce->ring, rq->head); 1308 - __i915_request_reset(rq, stalled); 1309 - 1310 - out_replay: 1311 - guc_reset_state(ce, head, stalled); 1312 - __unwind_incomplete_requests(ce); 990 + __unwind_incomplete_requests(parent); 1313 991 out_put: 1314 - intel_context_put(ce); 992 + intel_context_put(parent); 1315 993 } 1316 994 1317 995 void intel_guc_submission_reset(struct intel_guc *guc, bool stalled) ··· 1345 997 1346 998 xa_unlock(&guc->context_lookup); 1347 999 1348 - if (intel_context_is_pinned(ce)) 1000 + if (intel_context_is_pinned(ce) && 1001 + !intel_context_is_child(ce)) 1349 1002 __guc_reset_context(ce, stalled); 1350 1003 1351 1004 intel_context_put(ce); ··· 1438 1089 1439 1090 xa_unlock(&guc->context_lookup); 1440 1091 1441 - if (intel_context_is_pinned(ce)) 1092 + if (intel_context_is_pinned(ce) && 1093 + !intel_context_is_child(ce)) 1442 1094 guc_cancel_context_requests(ce); 1443 1095 1444 1096 intel_context_put(ce); ··· 1476 1126 intel_gt_unpark_heartbeats(guc_to_gt(guc)); 1477 1127 } 1478 1128 1129 + static void destroyed_worker_func(struct work_struct *w); 1130 + 1479 1131 /* 1480 1132 * Set up the memory resources to be shared with the GuC (via the GGTT) 1481 1133 * at firmware loading time. 
··· 1500 1148 1501 1149 xa_init_flags(&guc->context_lookup, XA_FLAGS_LOCK_IRQ); 1502 1150 1503 - spin_lock_init(&guc->contexts_lock); 1504 - INIT_LIST_HEAD(&guc->guc_id_list); 1505 - ida_init(&guc->guc_ids); 1151 + spin_lock_init(&guc->submission_state.lock); 1152 + INIT_LIST_HEAD(&guc->submission_state.guc_id_list); 1153 + ida_init(&guc->submission_state.guc_ids); 1154 + INIT_LIST_HEAD(&guc->submission_state.destroyed_contexts); 1155 + INIT_WORK(&guc->submission_state.destroyed_worker, 1156 + destroyed_worker_func); 1157 + 1158 + guc->submission_state.guc_ids_bitmap = 1159 + bitmap_zalloc(NUMBER_MULTI_LRC_GUC_ID, GFP_KERNEL); 1160 + if (!guc->submission_state.guc_ids_bitmap) 1161 + return -ENOMEM; 1506 1162 1507 1163 return 0; 1508 1164 } ··· 1520 1160 if (!guc->lrc_desc_pool) 1521 1161 return; 1522 1162 1163 + guc_flush_destroyed_contexts(guc); 1523 1164 guc_lrc_desc_pool_destroy(guc); 1524 1165 i915_sched_engine_put(guc->sched_engine); 1166 + bitmap_free(guc->submission_state.guc_ids_bitmap); 1525 1167 } 1526 1168 1527 1169 static inline void queue_request(struct i915_sched_engine *sched_engine, ··· 1540 1178 static int guc_bypass_tasklet_submit(struct intel_guc *guc, 1541 1179 struct i915_request *rq) 1542 1180 { 1543 - int ret; 1181 + int ret = 0; 1544 1182 1545 1183 __i915_request_submit(rq); 1546 1184 1547 1185 trace_i915_request_in(rq, 0); 1548 1186 1549 - guc_set_lrc_tail(rq); 1550 - ret = guc_add_request(guc, rq); 1551 - if (ret == -EBUSY) 1552 - guc->stalled_request = rq; 1187 + if (is_multi_lrc_rq(rq)) { 1188 + if (multi_lrc_submit(rq)) { 1189 + ret = guc_wq_item_append(guc, rq); 1190 + if (!ret) 1191 + ret = guc_add_request(guc, rq); 1192 + } 1193 + } else { 1194 + guc_set_lrc_tail(rq); 1195 + ret = guc_add_request(guc, rq); 1196 + } 1553 1197 1554 1198 if (unlikely(ret == -EPIPE)) 1555 1199 disable_submission(guc); 1556 1200 1557 1201 return ret; 1202 + } 1203 + 1204 + static bool need_tasklet(struct intel_guc *guc, struct i915_request *rq) 1205 + { 
1206 + struct i915_sched_engine *sched_engine = rq->engine->sched_engine; 1207 + struct intel_context *ce = request_to_scheduling_context(rq); 1208 + 1209 + return submission_disabled(guc) || guc->stalled_request || 1210 + !i915_sched_engine_is_empty(sched_engine) || 1211 + !lrc_desc_registered(guc, ce->guc_id.id); 1558 1212 } 1559 1213 1560 1214 static void guc_submit_request(struct i915_request *rq) ··· 1582 1204 /* Will be called from irq-context when using foreign fences. */ 1583 1205 spin_lock_irqsave(&sched_engine->lock, flags); 1584 1206 1585 - if (submission_disabled(guc) || guc->stalled_request || 1586 - !i915_sched_engine_is_empty(sched_engine)) 1207 + if (need_tasklet(guc, rq)) 1587 1208 queue_request(sched_engine, rq, rq_prio(rq)); 1588 1209 else if (guc_bypass_tasklet_submit(guc, rq) == -EBUSY) 1589 1210 tasklet_hi_schedule(&sched_engine->tasklet); ··· 1590 1213 spin_unlock_irqrestore(&sched_engine->lock, flags); 1591 1214 } 1592 1215 1593 - static int new_guc_id(struct intel_guc *guc) 1216 + static int new_guc_id(struct intel_guc *guc, struct intel_context *ce) 1594 1217 { 1595 - return ida_simple_get(&guc->guc_ids, 0, 1596 - GUC_MAX_LRC_DESCRIPTORS, GFP_KERNEL | 1597 - __GFP_RETRY_MAYFAIL | __GFP_NOWARN); 1218 + int ret; 1219 + 1220 + GEM_BUG_ON(intel_context_is_child(ce)); 1221 + 1222 + if (intel_context_is_parent(ce)) 1223 + ret = bitmap_find_free_region(guc->submission_state.guc_ids_bitmap, 1224 + NUMBER_MULTI_LRC_GUC_ID, 1225 + order_base_2(ce->parallel.number_children 1226 + + 1)); 1227 + else 1228 + ret = ida_simple_get(&guc->submission_state.guc_ids, 1229 + NUMBER_MULTI_LRC_GUC_ID, 1230 + GUC_MAX_LRC_DESCRIPTORS, 1231 + GFP_KERNEL | __GFP_RETRY_MAYFAIL | 1232 + __GFP_NOWARN); 1233 + if (unlikely(ret < 0)) 1234 + return ret; 1235 + 1236 + ce->guc_id.id = ret; 1237 + return 0; 1598 1238 } 1599 1239 1600 1240 static void __release_guc_id(struct intel_guc *guc, struct intel_context *ce) 1601 1241 { 1242 + GEM_BUG_ON(intel_context_is_child(ce)); 
1243 + 1602 1244 if (!context_guc_id_invalid(ce)) { 1603 - ida_simple_remove(&guc->guc_ids, ce->guc_id.id); 1245 + if (intel_context_is_parent(ce)) 1246 + bitmap_release_region(guc->submission_state.guc_ids_bitmap, 1247 + ce->guc_id.id, 1248 + order_base_2(ce->parallel.number_children 1249 + + 1)); 1250 + else 1251 + ida_simple_remove(&guc->submission_state.guc_ids, 1252 + ce->guc_id.id); 1604 1253 reset_lrc_desc(guc, ce->guc_id.id); 1605 1254 set_context_guc_id_invalid(ce); 1606 1255 } ··· 1638 1235 { 1639 1236 unsigned long flags; 1640 1237 1641 - spin_lock_irqsave(&guc->contexts_lock, flags); 1238 + spin_lock_irqsave(&guc->submission_state.lock, flags); 1642 1239 __release_guc_id(guc, ce); 1643 - spin_unlock_irqrestore(&guc->contexts_lock, flags); 1240 + spin_unlock_irqrestore(&guc->submission_state.lock, flags); 1644 1241 } 1645 1242 1646 - static int steal_guc_id(struct intel_guc *guc) 1243 + static int steal_guc_id(struct intel_guc *guc, struct intel_context *ce) 1647 1244 { 1648 - struct intel_context *ce; 1649 - int guc_id; 1245 + struct intel_context *cn; 1650 1246 1651 - lockdep_assert_held(&guc->contexts_lock); 1247 + lockdep_assert_held(&guc->submission_state.lock); 1248 + GEM_BUG_ON(intel_context_is_child(ce)); 1249 + GEM_BUG_ON(intel_context_is_parent(ce)); 1652 1250 1653 - if (!list_empty(&guc->guc_id_list)) { 1654 - ce = list_first_entry(&guc->guc_id_list, 1251 + if (!list_empty(&guc->submission_state.guc_id_list)) { 1252 + cn = list_first_entry(&guc->submission_state.guc_id_list, 1655 1253 struct intel_context, 1656 1254 guc_id.link); 1657 1255 1658 - GEM_BUG_ON(atomic_read(&ce->guc_id.ref)); 1659 - GEM_BUG_ON(context_guc_id_invalid(ce)); 1256 + GEM_BUG_ON(atomic_read(&cn->guc_id.ref)); 1257 + GEM_BUG_ON(context_guc_id_invalid(cn)); 1258 + GEM_BUG_ON(intel_context_is_child(cn)); 1259 + GEM_BUG_ON(intel_context_is_parent(cn)); 1660 1260 1661 - list_del_init(&ce->guc_id.link); 1662 - guc_id = ce->guc_id.id; 1261 + list_del_init(&cn->guc_id.link); 
1262 + ce->guc_id = cn->guc_id; 1663 1263 1664 1264 spin_lock(&ce->guc_state.lock); 1665 - clr_context_registered(ce); 1265 + clr_context_registered(cn); 1666 1266 spin_unlock(&ce->guc_state.lock); 1667 1267 1668 - set_context_guc_id_invalid(ce); 1669 - return guc_id; 1268 + set_context_guc_id_invalid(cn); 1269 + 1270 + return 0; 1670 1271 } else { 1671 1272 return -EAGAIN; 1672 1273 } 1673 1274 } 1674 1275 1675 - static int assign_guc_id(struct intel_guc *guc, u16 *out) 1276 + static int assign_guc_id(struct intel_guc *guc, struct intel_context *ce) 1676 1277 { 1677 1278 int ret; 1678 1279 1679 - lockdep_assert_held(&guc->contexts_lock); 1280 + lockdep_assert_held(&guc->submission_state.lock); 1281 + GEM_BUG_ON(intel_context_is_child(ce)); 1680 1282 1681 - ret = new_guc_id(guc); 1283 + ret = new_guc_id(guc, ce); 1682 1284 if (unlikely(ret < 0)) { 1683 - ret = steal_guc_id(guc); 1285 + if (intel_context_is_parent(ce)) 1286 + return -ENOSPC; 1287 + 1288 + ret = steal_guc_id(guc, ce); 1684 1289 if (ret < 0) 1685 1290 return ret; 1686 1291 } 1687 1292 1688 - *out = ret; 1293 + if (intel_context_is_parent(ce)) { 1294 + struct intel_context *child; 1295 + int i = 1; 1296 + 1297 + for_each_child(ce, child) 1298 + child->guc_id.id = ce->guc_id.id + i++; 1299 + } 1300 + 1689 1301 return 0; 1690 1302 } 1691 1303 ··· 1713 1295 GEM_BUG_ON(atomic_read(&ce->guc_id.ref)); 1714 1296 1715 1297 try_again: 1716 - spin_lock_irqsave(&guc->contexts_lock, flags); 1298 + spin_lock_irqsave(&guc->submission_state.lock, flags); 1717 1299 1718 1300 might_lock(&ce->guc_state.lock); 1719 1301 1720 1302 if (context_guc_id_invalid(ce)) { 1721 - ret = assign_guc_id(guc, &ce->guc_id.id); 1303 + ret = assign_guc_id(guc, ce); 1722 1304 if (ret) 1723 1305 goto out_unlock; 1724 1306 ret = 1; /* Indidcates newly assigned guc_id */ ··· 1728 1310 atomic_inc(&ce->guc_id.ref); 1729 1311 1730 1312 out_unlock: 1731 - spin_unlock_irqrestore(&guc->contexts_lock, flags); 1313 + 
spin_unlock_irqrestore(&guc->submission_state.lock, flags); 1732 1314 1733 1315 /* 1734 1316 * -EAGAIN indicates no guc_id are available, let's retire any ··· 1760 1342 unsigned long flags; 1761 1343 1762 1344 GEM_BUG_ON(atomic_read(&ce->guc_id.ref) < 0); 1345 + GEM_BUG_ON(intel_context_is_child(ce)); 1763 1346 1764 - if (unlikely(context_guc_id_invalid(ce))) 1347 + if (unlikely(context_guc_id_invalid(ce) || 1348 + intel_context_is_parent(ce))) 1765 1349 return; 1766 1350 1767 - spin_lock_irqsave(&guc->contexts_lock, flags); 1351 + spin_lock_irqsave(&guc->submission_state.lock, flags); 1768 1352 if (!context_guc_id_invalid(ce) && list_empty(&ce->guc_id.link) && 1769 1353 !atomic_read(&ce->guc_id.ref)) 1770 - list_add_tail(&ce->guc_id.link, &guc->guc_id_list); 1771 - spin_unlock_irqrestore(&guc->contexts_lock, flags); 1354 + list_add_tail(&ce->guc_id.link, 1355 + &guc->submission_state.guc_id_list); 1356 + spin_unlock_irqrestore(&guc->submission_state.lock, flags); 1357 + } 1358 + 1359 + static int __guc_action_register_multi_lrc(struct intel_guc *guc, 1360 + struct intel_context *ce, 1361 + u32 guc_id, 1362 + u32 offset, 1363 + bool loop) 1364 + { 1365 + struct intel_context *child; 1366 + u32 action[4 + MAX_ENGINE_INSTANCE]; 1367 + int len = 0; 1368 + 1369 + GEM_BUG_ON(ce->parallel.number_children > MAX_ENGINE_INSTANCE); 1370 + 1371 + action[len++] = INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC; 1372 + action[len++] = guc_id; 1373 + action[len++] = ce->parallel.number_children + 1; 1374 + action[len++] = offset; 1375 + for_each_child(ce, child) { 1376 + offset += sizeof(struct guc_lrc_desc); 1377 + action[len++] = offset; 1378 + } 1379 + 1380 + return guc_submission_send_busy_loop(guc, action, len, 0, loop); 1772 1381 } 1773 1382 1774 1383 static int __guc_action_register_context(struct intel_guc *guc, ··· 1820 1375 ce->guc_id.id * sizeof(struct guc_lrc_desc); 1821 1376 int ret; 1822 1377 1378 + GEM_BUG_ON(intel_context_is_child(ce)); 1823 1379 
trace_intel_context_register(ce); 1824 1380 1825 - ret = __guc_action_register_context(guc, ce->guc_id.id, offset, loop); 1381 + if (intel_context_is_parent(ce)) 1382 + ret = __guc_action_register_multi_lrc(guc, ce, ce->guc_id.id, 1383 + offset, loop); 1384 + else 1385 + ret = __guc_action_register_context(guc, ce->guc_id.id, offset, 1386 + loop); 1826 1387 if (likely(!ret)) { 1827 1388 unsigned long flags; 1828 1389 ··· 1857 1406 { 1858 1407 struct intel_guc *guc = ce_to_guc(ce); 1859 1408 1409 + GEM_BUG_ON(intel_context_is_child(ce)); 1860 1410 trace_intel_context_deregister(ce); 1861 1411 1862 1412 return __guc_action_deregister_context(guc, guc_id); 1863 1413 } 1864 1414 1865 - static intel_engine_mask_t adjust_engine_mask(u8 class, intel_engine_mask_t mask) 1415 + static inline void clear_children_join_go_memory(struct intel_context *ce) 1866 1416 { 1867 - switch (class) { 1868 - case RENDER_CLASS: 1869 - return mask >> RCS0; 1870 - case VIDEO_ENHANCEMENT_CLASS: 1871 - return mask >> VECS0; 1872 - case VIDEO_DECODE_CLASS: 1873 - return mask >> VCS0; 1874 - case COPY_ENGINE_CLASS: 1875 - return mask >> BCS0; 1876 - default: 1877 - MISSING_CASE(class); 1878 - return 0; 1879 - } 1417 + struct parent_scratch *ps = __get_parent_scratch(ce); 1418 + int i; 1419 + 1420 + ps->go.semaphore = 0; 1421 + for (i = 0; i < ce->parallel.number_children + 1; ++i) 1422 + ps->join[i].semaphore = 0; 1423 + } 1424 + 1425 + static inline u32 get_children_go_value(struct intel_context *ce) 1426 + { 1427 + return __get_parent_scratch(ce)->go.semaphore; 1428 + } 1429 + 1430 + static inline u32 get_children_join_value(struct intel_context *ce, 1431 + u8 child_index) 1432 + { 1433 + return __get_parent_scratch(ce)->join[child_index].semaphore; 1880 1434 } 1881 1435 1882 1436 static void guc_context_policy_init(struct intel_engine_cs *engine, ··· 1906 1450 struct guc_lrc_desc *desc; 1907 1451 bool context_registered; 1908 1452 intel_wakeref_t wakeref; 1453 + struct intel_context *child; 
1909 1454 int ret = 0; 1910 1455 1911 1456 GEM_BUG_ON(!engine->mask); ··· 1926 1469 1927 1470 desc = __get_lrc_desc(guc, desc_idx); 1928 1471 desc->engine_class = engine_class_to_guc_class(engine->class); 1929 - desc->engine_submit_mask = adjust_engine_mask(engine->class, 1930 - engine->mask); 1472 + desc->engine_submit_mask = engine->logical_mask; 1931 1473 desc->hw_context_desc = ce->lrc.lrca; 1932 1474 desc->priority = ce->guc_state.prio; 1933 1475 desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; 1934 1476 guc_context_policy_init(engine, desc); 1477 + 1478 + /* 1479 + * If context is a parent, we need to register a process descriptor 1480 + * describing a work queue and register all child contexts. 1481 + */ 1482 + if (intel_context_is_parent(ce)) { 1483 + struct guc_process_desc *pdesc; 1484 + 1485 + ce->parallel.guc.wqi_tail = 0; 1486 + ce->parallel.guc.wqi_head = 0; 1487 + 1488 + desc->process_desc = i915_ggtt_offset(ce->state) + 1489 + __get_parent_scratch_offset(ce); 1490 + desc->wq_addr = i915_ggtt_offset(ce->state) + 1491 + __get_wq_offset(ce); 1492 + desc->wq_size = WQ_SIZE; 1493 + 1494 + pdesc = __get_process_desc(ce); 1495 + memset(pdesc, 0, sizeof(*(pdesc))); 1496 + pdesc->stage_id = ce->guc_id.id; 1497 + pdesc->wq_base_addr = desc->wq_addr; 1498 + pdesc->wq_size_bytes = desc->wq_size; 1499 + pdesc->wq_status = WQ_STATUS_ACTIVE; 1500 + 1501 + for_each_child(ce, child) { 1502 + desc = __get_lrc_desc(guc, child->guc_id.id); 1503 + 1504 + desc->engine_class = 1505 + engine_class_to_guc_class(engine->class); 1506 + desc->hw_context_desc = child->lrc.lrca; 1507 + desc->priority = ce->guc_state.prio; 1508 + desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; 1509 + guc_context_policy_init(engine, desc); 1510 + } 1511 + 1512 + clear_children_join_go_memory(ce); 1513 + } 1935 1514 1936 1515 /* 1937 1516 * The context_lookup xarray is used to determine if the hardware ··· 2052 1559 2053 1560 static int guc_context_pin(struct intel_context *ce, void 
*vaddr) 2054 1561 { 2055 - return __guc_context_pin(ce, ce->engine, vaddr); 1562 + int ret = __guc_context_pin(ce, ce->engine, vaddr); 1563 + 1564 + if (likely(!ret && !intel_context_is_barrier(ce))) 1565 + intel_engine_pm_get(ce->engine); 1566 + 1567 + return ret; 2056 1568 } 2057 1569 2058 1570 static void guc_context_unpin(struct intel_context *ce) ··· 2066 1568 2067 1569 unpin_guc_id(guc, ce); 2068 1570 lrc_unpin(ce); 1571 + 1572 + if (likely(!intel_context_is_barrier(ce))) 1573 + intel_engine_pm_put_async(ce->engine); 2069 1574 } 2070 1575 2071 1576 static void guc_context_post_unpin(struct intel_context *ce) ··· 2103 1602 2104 1603 GEM_BUG_ON(guc_id == GUC_INVALID_LRC_ID); 2105 1604 1605 + GEM_BUG_ON(intel_context_is_child(ce)); 2106 1606 trace_intel_context_sched_disable(ce); 2107 1607 2108 1608 guc_submission_send_busy_loop(guc, action, ARRAY_SIZE(action), ··· 2154 1652 intel_wakeref_t wakeref; 2155 1653 u16 guc_id; 2156 1654 bool enabled; 1655 + 1656 + GEM_BUG_ON(intel_context_is_child(ce)); 2157 1657 2158 1658 spin_lock_irqsave(&ce->guc_state.lock, flags); 2159 1659 ··· 2211 1707 bool enable; 2212 1708 2213 1709 GEM_BUG_ON(context_enabled(ce)); 1710 + GEM_BUG_ON(intel_context_is_child(ce)); 2214 1711 2215 1712 spin_lock_irqsave(&ce->guc_state.lock, flags); 2216 1713 ··· 2238 1733 static void guc_context_cancel_request(struct intel_context *ce, 2239 1734 struct i915_request *rq) 2240 1735 { 1736 + struct intel_context *block_context = 1737 + request_to_scheduling_context(rq); 1738 + 2241 1739 if (i915_sw_fence_signaled(&rq->submit)) { 2242 1740 struct i915_sw_fence *fence; 2243 1741 2244 1742 intel_context_get(ce); 2245 - fence = guc_context_block(ce); 1743 + fence = guc_context_block(block_context); 2246 1744 i915_sw_fence_wait(fence); 2247 1745 if (!i915_request_completed(rq)) { 2248 1746 __i915_request_skip(rq); ··· 2259 1751 */ 2260 1752 flush_work(&ce_to_guc(ce)->ct.requests.worker); 2261 1753 2262 - guc_context_unblock(ce); 1754 + 
guc_context_unblock(block_context); 2263 1755 intel_context_put(ce); 2264 1756 } 2265 1757 } ··· 2284 1776 &ce->engine->gt->i915->runtime_pm; 2285 1777 intel_wakeref_t wakeref; 2286 1778 unsigned long flags; 1779 + 1780 + GEM_BUG_ON(intel_context_is_child(ce)); 2287 1781 2288 1782 guc_flush_submissions(guc); 2289 1783 ··· 2337 1827 intel_wakeref_t wakeref; 2338 1828 u16 guc_id; 2339 1829 1830 + GEM_BUG_ON(intel_context_is_child(ce)); 1831 + 2340 1832 spin_lock_irqsave(&ce->guc_state.lock, flags); 2341 1833 2342 1834 /* ··· 2369 1857 static inline void guc_lrc_desc_unpin(struct intel_context *ce) 2370 1858 { 2371 1859 struct intel_guc *guc = ce_to_guc(ce); 1860 + struct intel_gt *gt = guc_to_gt(guc); 1861 + unsigned long flags; 1862 + bool disabled; 2372 1863 1864 + GEM_BUG_ON(!intel_gt_pm_is_awake(gt)); 2373 1865 GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id)); 2374 1866 GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id)); 2375 1867 GEM_BUG_ON(context_enabled(ce)); 1868 + 1869 + /* Seal race with Reset */ 1870 + spin_lock_irqsave(&ce->guc_state.lock, flags); 1871 + disabled = submission_disabled(guc); 1872 + if (likely(!disabled)) { 1873 + __intel_gt_pm_get(gt); 1874 + set_context_destroyed(ce); 1875 + clr_context_registered(ce); 1876 + } 1877 + spin_unlock_irqrestore(&ce->guc_state.lock, flags); 1878 + if (unlikely(disabled)) { 1879 + release_guc_id(guc, ce); 1880 + __guc_context_destroy(ce); 1881 + return; 1882 + } 2376 1883 2377 1884 deregister_context(ce, ce->guc_id.id); 2378 1885 } ··· 2420 1889 } 2421 1890 } 2422 1891 1892 + static void guc_flush_destroyed_contexts(struct intel_guc *guc) 1893 + { 1894 + struct intel_context *ce, *cn; 1895 + unsigned long flags; 1896 + 1897 + GEM_BUG_ON(!submission_disabled(guc) && 1898 + guc_submission_initialized(guc)); 1899 + 1900 + spin_lock_irqsave(&guc->submission_state.lock, flags); 1901 + list_for_each_entry_safe(ce, cn, 1902 + &guc->submission_state.destroyed_contexts, 1903 + destroyed_link) { 1904 + 
list_del_init(&ce->destroyed_link); 1905 + __release_guc_id(guc, ce); 1906 + __guc_context_destroy(ce); 1907 + } 1908 + spin_unlock_irqrestore(&guc->submission_state.lock, flags); 1909 + } 1910 + 1911 + static void deregister_destroyed_contexts(struct intel_guc *guc) 1912 + { 1913 + struct intel_context *ce, *cn; 1914 + unsigned long flags; 1915 + 1916 + spin_lock_irqsave(&guc->submission_state.lock, flags); 1917 + list_for_each_entry_safe(ce, cn, 1918 + &guc->submission_state.destroyed_contexts, 1919 + destroyed_link) { 1920 + list_del_init(&ce->destroyed_link); 1921 + guc_lrc_desc_unpin(ce); 1922 + } 1923 + spin_unlock_irqrestore(&guc->submission_state.lock, flags); 1924 + } 1925 + 1926 + static void destroyed_worker_func(struct work_struct *w) 1927 + { 1928 + struct intel_guc *guc = container_of(w, struct intel_guc, 1929 + submission_state.destroyed_worker); 1930 + struct intel_gt *gt = guc_to_gt(guc); 1931 + int tmp; 1932 + 1933 + with_intel_gt_pm(gt, tmp) 1934 + deregister_destroyed_contexts(guc); 1935 + } 1936 + 2423 1937 static void guc_context_destroy(struct kref *kref) 2424 1938 { 2425 1939 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 2426 - struct intel_runtime_pm *runtime_pm = ce->engine->uncore->rpm; 2427 1940 struct intel_guc *guc = ce_to_guc(ce); 2428 - intel_wakeref_t wakeref; 2429 1941 unsigned long flags; 2430 - bool disabled; 1942 + bool destroy; 2431 1943 2432 1944 /* 2433 1945 * If the guc_id is invalid this context has been stolen and we can free 2434 1946 * it immediately. Also can be freed immediately if the context is not 2435 1947 * registered with the GuC or the GuC is in the middle of a reset. 
2436 1948 */ 2437 - if (context_guc_id_invalid(ce)) { 2438 - __guc_context_destroy(ce); 2439 - return; 2440 - } else if (submission_disabled(guc) || 2441 - !lrc_desc_registered(guc, ce->guc_id.id)) { 2442 - release_guc_id(guc, ce); 2443 - __guc_context_destroy(ce); 2444 - return; 1949 + spin_lock_irqsave(&guc->submission_state.lock, flags); 1950 + destroy = submission_disabled(guc) || context_guc_id_invalid(ce) || 1951 + !lrc_desc_registered(guc, ce->guc_id.id); 1952 + if (likely(!destroy)) { 1953 + if (!list_empty(&ce->guc_id.link)) 1954 + list_del_init(&ce->guc_id.link); 1955 + list_add_tail(&ce->destroyed_link, 1956 + &guc->submission_state.destroyed_contexts); 1957 + } else { 1958 + __release_guc_id(guc, ce); 2445 1959 } 2446 - 2447 - /* 2448 - * We have to acquire the context spinlock and check guc_id again, if it 2449 - * is valid it hasn't been stolen and needs to be deregistered. We 2450 - * delete this context from the list of unpinned guc_id available to 2451 - * steal to seal a race with guc_lrc_desc_pin(). When the G2H CTB 2452 - * returns indicating this context has been deregistered the guc_id is 2453 - * returned to the pool of available guc_id. 
2454 - */ 2455 - spin_lock_irqsave(&guc->contexts_lock, flags); 2456 - if (context_guc_id_invalid(ce)) { 2457 - spin_unlock_irqrestore(&guc->contexts_lock, flags); 2458 - __guc_context_destroy(ce); 2459 - return; 2460 - } 2461 - 2462 - if (!list_empty(&ce->guc_id.link)) 2463 - list_del_init(&ce->guc_id.link); 2464 - spin_unlock_irqrestore(&guc->contexts_lock, flags); 2465 - 2466 - /* Seal race with Reset */ 2467 - spin_lock_irqsave(&ce->guc_state.lock, flags); 2468 - disabled = submission_disabled(guc); 2469 - if (likely(!disabled)) { 2470 - set_context_destroyed(ce); 2471 - clr_context_registered(ce); 2472 - } 2473 - spin_unlock_irqrestore(&ce->guc_state.lock, flags); 2474 - if (unlikely(disabled)) { 2475 - release_guc_id(guc, ce); 1960 + spin_unlock_irqrestore(&guc->submission_state.lock, flags); 1961 + if (unlikely(destroy)) { 2476 1962 __guc_context_destroy(ce); 2477 1963 return; 2478 1964 } 2479 1965 2480 1966 /* 2481 - * We defer GuC context deregistration until the context is destroyed 2482 - * in order to save on CTBs. With this optimization ideally we only need 2483 - * 1 CTB to register the context during the first pin and 1 CTB to 2484 - * deregister the context when the context is destroyed. Without this 2485 - * optimization, a CTB would be needed every pin & unpin. 2486 - * 2487 - * XXX: Need to acqiure the runtime wakeref as this can be triggered 2488 - * from context_free_worker when runtime wakeref is not held. 2489 - * guc_lrc_desc_unpin requires the runtime as a GuC register is written 2490 - * in H2G CTB to deregister the context. A future patch may defer this 2491 - * H2G CTB if the runtime wakeref is zero. 1967 + * We use a worker to issue the H2G to deregister the context as we can 1968 + * take the GT PM for the first time which isn't allowed from an atomic 1969 + * context. 
2492 1970 */ 2493 - with_intel_runtime_pm(runtime_pm, wakeref) 2494 - guc_lrc_desc_unpin(ce); 1971 + queue_work(system_unbound_wq, &guc->submission_state.destroyed_worker); 2495 1972 } 2496 1973 2497 1974 static int guc_context_alloc(struct intel_context *ce) ··· 2595 2056 2596 2057 static void add_to_context(struct i915_request *rq) 2597 2058 { 2598 - struct intel_context *ce = rq->context; 2059 + struct intel_context *ce = request_to_scheduling_context(rq); 2599 2060 u8 new_guc_prio = map_i915_prio_to_guc_prio(rq_prio(rq)); 2600 2061 2062 + GEM_BUG_ON(intel_context_is_child(ce)); 2601 2063 GEM_BUG_ON(rq->guc_prio == GUC_PRIO_FINI); 2602 2064 2603 2065 spin_lock(&ce->guc_state.lock); ··· 2631 2091 2632 2092 static void remove_from_context(struct i915_request *rq) 2633 2093 { 2634 - struct intel_context *ce = rq->context; 2094 + struct intel_context *ce = request_to_scheduling_context(rq); 2095 + 2096 + GEM_BUG_ON(intel_context_is_child(ce)); 2635 2097 2636 2098 spin_lock_irq(&ce->guc_state.lock); 2637 2099 ··· 2674 2132 .destroy = guc_context_destroy, 2675 2133 2676 2134 .create_virtual = guc_create_virtual, 2135 + .create_parallel = guc_create_parallel, 2677 2136 }; 2678 2137 2679 2138 static void submit_work_cb(struct irq_work *wrk) ··· 2711 2168 { 2712 2169 unsigned long flags; 2713 2170 2171 + GEM_BUG_ON(intel_context_is_child(ce)); 2172 + 2714 2173 spin_lock_irqsave(&ce->guc_state.lock, flags); 2715 2174 clr_context_wait_for_deregister_to_register(ce); 2716 2175 __guc_signal_context_fence(ce); ··· 2743 2198 2744 2199 static int guc_request_alloc(struct i915_request *rq) 2745 2200 { 2746 - struct intel_context *ce = rq->context; 2201 + struct intel_context *ce = request_to_scheduling_context(rq); 2747 2202 struct intel_guc *guc = ce_to_guc(ce); 2748 2203 unsigned long flags; 2749 2204 int ret; ··· 2847 2302 static int guc_virtual_context_pin(struct intel_context *ce, void *vaddr) 2848 2303 { 2849 2304 struct intel_engine_cs *engine = 
guc_virtual_get_sibling(ce->engine, 0); 2305 + int ret = __guc_context_pin(ce, engine, vaddr); 2306 + intel_engine_mask_t tmp, mask = ce->engine->mask; 2850 2307 2851 - return __guc_context_pin(ce, engine, vaddr); 2308 + if (likely(!ret)) 2309 + for_each_engine_masked(engine, ce->engine->gt, mask, tmp) 2310 + intel_engine_pm_get(engine); 2311 + 2312 + return ret; 2313 + } 2314 + 2315 + static void guc_virtual_context_unpin(struct intel_context *ce) 2316 + { 2317 + intel_engine_mask_t tmp, mask = ce->engine->mask; 2318 + struct intel_engine_cs *engine; 2319 + struct intel_guc *guc = ce_to_guc(ce); 2320 + 2321 + GEM_BUG_ON(context_enabled(ce)); 2322 + GEM_BUG_ON(intel_context_is_barrier(ce)); 2323 + 2324 + unpin_guc_id(guc, ce); 2325 + lrc_unpin(ce); 2326 + 2327 + for_each_engine_masked(engine, ce->engine->gt, mask, tmp) 2328 + intel_engine_pm_put_async(engine); 2852 2329 } 2853 2330 2854 2331 static void guc_virtual_context_enter(struct intel_context *ce) ··· 2907 2340 2908 2341 .pre_pin = guc_virtual_context_pre_pin, 2909 2342 .pin = guc_virtual_context_pin, 2910 - .unpin = guc_context_unpin, 2343 + .unpin = guc_virtual_context_unpin, 2911 2344 .post_unpin = guc_context_post_unpin, 2912 2345 2913 2346 .ban = guc_context_ban, ··· 2923 2356 2924 2357 .get_sibling = guc_virtual_get_sibling, 2925 2358 }; 2359 + 2360 + static int guc_parent_context_pin(struct intel_context *ce, void *vaddr) 2361 + { 2362 + struct intel_engine_cs *engine = guc_virtual_get_sibling(ce->engine, 0); 2363 + struct intel_guc *guc = ce_to_guc(ce); 2364 + int ret; 2365 + 2366 + GEM_BUG_ON(!intel_context_is_parent(ce)); 2367 + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); 2368 + 2369 + ret = pin_guc_id(guc, ce); 2370 + if (unlikely(ret < 0)) 2371 + return ret; 2372 + 2373 + return __guc_context_pin(ce, engine, vaddr); 2374 + } 2375 + 2376 + static int guc_child_context_pin(struct intel_context *ce, void *vaddr) 2377 + { 2378 + struct intel_engine_cs *engine = 
guc_virtual_get_sibling(ce->engine, 0); 2379 + 2380 + GEM_BUG_ON(!intel_context_is_child(ce)); 2381 + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); 2382 + 2383 + __intel_context_pin(ce->parallel.parent); 2384 + return __guc_context_pin(ce, engine, vaddr); 2385 + } 2386 + 2387 + static void guc_parent_context_unpin(struct intel_context *ce) 2388 + { 2389 + struct intel_guc *guc = ce_to_guc(ce); 2390 + 2391 + GEM_BUG_ON(context_enabled(ce)); 2392 + GEM_BUG_ON(intel_context_is_barrier(ce)); 2393 + GEM_BUG_ON(!intel_context_is_parent(ce)); 2394 + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); 2395 + 2396 + if (ce->parallel.last_rq) 2397 + i915_request_put(ce->parallel.last_rq); 2398 + unpin_guc_id(guc, ce); 2399 + lrc_unpin(ce); 2400 + } 2401 + 2402 + static void guc_child_context_unpin(struct intel_context *ce) 2403 + { 2404 + GEM_BUG_ON(context_enabled(ce)); 2405 + GEM_BUG_ON(intel_context_is_barrier(ce)); 2406 + GEM_BUG_ON(!intel_context_is_child(ce)); 2407 + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); 2408 + 2409 + lrc_unpin(ce); 2410 + } 2411 + 2412 + static void guc_child_context_post_unpin(struct intel_context *ce) 2413 + { 2414 + GEM_BUG_ON(!intel_context_is_child(ce)); 2415 + GEM_BUG_ON(!intel_context_is_pinned(ce->parallel.parent)); 2416 + GEM_BUG_ON(!intel_engine_is_virtual(ce->engine)); 2417 + 2418 + lrc_post_unpin(ce); 2419 + intel_context_unpin(ce->parallel.parent); 2420 + } 2421 + 2422 + static void guc_child_context_destroy(struct kref *kref) 2423 + { 2424 + struct intel_context *ce = container_of(kref, typeof(*ce), ref); 2425 + 2426 + __guc_context_destroy(ce); 2427 + } 2428 + 2429 + static const struct intel_context_ops virtual_parent_context_ops = { 2430 + .alloc = guc_virtual_context_alloc, 2431 + 2432 + .pre_pin = guc_context_pre_pin, 2433 + .pin = guc_parent_context_pin, 2434 + .unpin = guc_parent_context_unpin, 2435 + .post_unpin = guc_context_post_unpin, 2436 + 2437 + .ban = guc_context_ban, 2438 + 2439 + .cancel_request = 
guc_context_cancel_request, 2440 + 2441 + .enter = guc_virtual_context_enter, 2442 + .exit = guc_virtual_context_exit, 2443 + 2444 + .sched_disable = guc_context_sched_disable, 2445 + 2446 + .destroy = guc_context_destroy, 2447 + 2448 + .get_sibling = guc_virtual_get_sibling, 2449 + }; 2450 + 2451 + static const struct intel_context_ops virtual_child_context_ops = { 2452 + .alloc = guc_virtual_context_alloc, 2453 + 2454 + .pre_pin = guc_context_pre_pin, 2455 + .pin = guc_child_context_pin, 2456 + .unpin = guc_child_context_unpin, 2457 + .post_unpin = guc_child_context_post_unpin, 2458 + 2459 + .cancel_request = guc_context_cancel_request, 2460 + 2461 + .enter = guc_virtual_context_enter, 2462 + .exit = guc_virtual_context_exit, 2463 + 2464 + .destroy = guc_child_context_destroy, 2465 + 2466 + .get_sibling = guc_virtual_get_sibling, 2467 + }; 2468 + 2469 + /* 2470 + * The below override of the breadcrumbs is enabled when the user configures a 2471 + * context for parallel submission (multi-lrc, parent-child). 2472 + * 2473 + * The overridden breadcrumbs implements an algorithm which allows the GuC to 2474 + * safely preempt all the hw contexts configured for parallel submission 2475 + * between each BB. The contract between the i915 and GuC is if the parent 2476 + * context can be preempted, all the children can be preempted, and the GuC will 2477 + * always try to preempt the parent before the children. A handshake between the 2478 + * parent / children breadcrumbs ensures the i915 holds up its end of the deal 2479 + * creating a window to preempt between each set of BBs. 
2480 + */ 2481 + static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, 2482 + u64 offset, u32 len, 2483 + const unsigned int flags); 2484 + static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, 2485 + u64 offset, u32 len, 2486 + const unsigned int flags); 2487 + static u32 * 2488 + emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, 2489 + u32 *cs); 2490 + static u32 * 2491 + emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, 2492 + u32 *cs); 2493 + 2494 + static struct intel_context * 2495 + guc_create_parallel(struct intel_engine_cs **engines, 2496 + unsigned int num_siblings, 2497 + unsigned int width) 2498 + { 2499 + struct intel_engine_cs **siblings = NULL; 2500 + struct intel_context *parent = NULL, *ce, *err; 2501 + int i, j; 2502 + 2503 + siblings = kmalloc_array(num_siblings, 2504 + sizeof(*siblings), 2505 + GFP_KERNEL); 2506 + if (!siblings) 2507 + return ERR_PTR(-ENOMEM); 2508 + 2509 + for (i = 0; i < width; ++i) { 2510 + for (j = 0; j < num_siblings; ++j) 2511 + siblings[j] = engines[i * num_siblings + j]; 2512 + 2513 + ce = intel_engine_create_virtual(siblings, num_siblings, 2514 + FORCE_VIRTUAL); 2515 + if (!ce) { 2516 + err = ERR_PTR(-ENOMEM); 2517 + goto unwind; 2518 + } 2519 + 2520 + if (i == 0) { 2521 + parent = ce; 2522 + parent->ops = &virtual_parent_context_ops; 2523 + } else { 2524 + ce->ops = &virtual_child_context_ops; 2525 + intel_context_bind_parent_child(parent, ce); 2526 + } 2527 + } 2528 + 2529 + parent->parallel.fence_context = dma_fence_context_alloc(1); 2530 + 2531 + parent->engine->emit_bb_start = 2532 + emit_bb_start_parent_no_preempt_mid_batch; 2533 + parent->engine->emit_fini_breadcrumb = 2534 + emit_fini_breadcrumb_parent_no_preempt_mid_batch; 2535 + parent->engine->emit_fini_breadcrumb_dw = 2536 + 12 + 4 * parent->parallel.number_children; 2537 + for_each_child(parent, ce) { 2538 + ce->engine->emit_bb_start = 2539 + 
emit_bb_start_child_no_preempt_mid_batch; 2540 + ce->engine->emit_fini_breadcrumb = 2541 + emit_fini_breadcrumb_child_no_preempt_mid_batch; 2542 + ce->engine->emit_fini_breadcrumb_dw = 16; 2543 + } 2544 + 2545 + kfree(siblings); 2546 + return parent; 2547 + 2548 + unwind: 2549 + if (parent) 2550 + intel_context_put(parent); 2551 + kfree(siblings); 2552 + return err; 2553 + } 2926 2554 2927 2555 static bool 2928 2556 guc_irq_enable_breadcrumbs(struct intel_breadcrumbs *b) ··· 3178 2416 static void guc_bump_inflight_request_prio(struct i915_request *rq, 3179 2417 int prio) 3180 2418 { 3181 - struct intel_context *ce = rq->context; 2419 + struct intel_context *ce = request_to_scheduling_context(rq); 3182 2420 u8 new_guc_prio = map_i915_prio_to_guc_prio(prio); 3183 2421 3184 2422 /* Short circuit function */ ··· 3201 2439 3202 2440 static void guc_retire_inflight_request_prio(struct i915_request *rq) 3203 2441 { 3204 - struct intel_context *ce = rq->context; 2442 + struct intel_context *ce = request_to_scheduling_context(rq); 3205 2443 3206 2444 spin_lock(&ce->guc_state.lock); 3207 2445 guc_prio_fini(rq, ce); ··· 3515 2753 return NULL; 3516 2754 } 3517 2755 2756 + if (unlikely(intel_context_is_child(ce))) { 2757 + drm_err(&guc_to_gt(guc)->i915->drm, 2758 + "Context is child, desc_idx %u", desc_idx); 2759 + return NULL; 2760 + } 2761 + 3518 2762 return ce; 3519 2763 } 3520 2764 ··· 3564 2796 intel_context_put(ce); 3565 2797 } else if (context_destroyed(ce)) { 3566 2798 /* Context has been destroyed */ 2799 + intel_gt_pm_put_async(guc_to_gt(guc)); 3567 2800 release_guc_id(guc, ce); 3568 2801 __guc_context_destroy(ce); 3569 2802 } ··· 3891 3122 drm_printf(p, "\n"); 3892 3123 } 3893 3124 3125 + static inline void guc_log_context(struct drm_printer *p, 3126 + struct intel_context *ce) 3127 + { 3128 + drm_printf(p, "GuC lrc descriptor %u:\n", ce->guc_id.id); 3129 + drm_printf(p, "\tHW Context Desc: 0x%08x\n", ce->lrc.lrca); 3130 + drm_printf(p, "\t\tLRC Head: Internal %u, 
Memory %u\n", 3131 + ce->ring->head, 3132 + ce->lrc_reg_state[CTX_RING_HEAD]); 3133 + drm_printf(p, "\t\tLRC Tail: Internal %u, Memory %u\n", 3134 + ce->ring->tail, 3135 + ce->lrc_reg_state[CTX_RING_TAIL]); 3136 + drm_printf(p, "\t\tContext Pin Count: %u\n", 3137 + atomic_read(&ce->pin_count)); 3138 + drm_printf(p, "\t\tGuC ID Ref Count: %u\n", 3139 + atomic_read(&ce->guc_id.ref)); 3140 + drm_printf(p, "\t\tSchedule State: 0x%x\n\n", 3141 + ce->guc_state.sched_state); 3142 + } 3143 + 3894 3144 void intel_guc_submission_print_context_info(struct intel_guc *guc, 3895 3145 struct drm_printer *p) 3896 3146 { ··· 3919 3131 3920 3132 xa_lock_irqsave(&guc->context_lookup, flags); 3921 3133 xa_for_each(&guc->context_lookup, index, ce) { 3922 - drm_printf(p, "GuC lrc descriptor %u:\n", ce->guc_id.id); 3923 - drm_printf(p, "\tHW Context Desc: 0x%08x\n", ce->lrc.lrca); 3924 - drm_printf(p, "\t\tLRC Head: Internal %u, Memory %u\n", 3925 - ce->ring->head, 3926 - ce->lrc_reg_state[CTX_RING_HEAD]); 3927 - drm_printf(p, "\t\tLRC Tail: Internal %u, Memory %u\n", 3928 - ce->ring->tail, 3929 - ce->lrc_reg_state[CTX_RING_TAIL]); 3930 - drm_printf(p, "\t\tContext Pin Count: %u\n", 3931 - atomic_read(&ce->pin_count)); 3932 - drm_printf(p, "\t\tGuC ID Ref Count: %u\n", 3933 - atomic_read(&ce->guc_id.ref)); 3934 - drm_printf(p, "\t\tSchedule State: 0x%x\n\n", 3935 - ce->guc_state.sched_state); 3134 + GEM_BUG_ON(intel_context_is_child(ce)); 3936 3135 3136 + guc_log_context(p, ce); 3937 3137 guc_log_context_priority(p, ce); 3138 + 3139 + if (intel_context_is_parent(ce)) { 3140 + struct guc_process_desc *desc = __get_process_desc(ce); 3141 + struct intel_context *child; 3142 + 3143 + drm_printf(p, "\t\tNumber children: %u\n", 3144 + ce->parallel.number_children); 3145 + drm_printf(p, "\t\tWQI Head: %u\n", 3146 + READ_ONCE(desc->head)); 3147 + drm_printf(p, "\t\tWQI Tail: %u\n", 3148 + READ_ONCE(desc->tail)); 3149 + drm_printf(p, "\t\tWQI Status: %u\n\n", 3150 + READ_ONCE(desc->wq_status)); 
3151 + 3152 + if (ce->engine->emit_bb_start == 3153 + emit_bb_start_parent_no_preempt_mid_batch) { 3154 + u8 i; 3155 + 3156 + drm_printf(p, "\t\tChildren Go: %u\n\n", 3157 + get_children_go_value(ce)); 3158 + for (i = 0; i < ce->parallel.number_children; ++i) 3159 + drm_printf(p, "\t\tChildren Join: %u\n", 3160 + get_children_join_value(ce, i)); 3161 + } 3162 + 3163 + for_each_child(ce, child) 3164 + guc_log_context(p, child); 3165 + } 3938 3166 } 3939 3167 xa_unlock_irqrestore(&guc->context_lookup, flags); 3940 3168 } 3941 3169 3170 + static inline u32 get_children_go_addr(struct intel_context *ce) 3171 + { 3172 + GEM_BUG_ON(!intel_context_is_parent(ce)); 3173 + 3174 + return i915_ggtt_offset(ce->state) + 3175 + __get_parent_scratch_offset(ce) + 3176 + offsetof(struct parent_scratch, go.semaphore); 3177 + } 3178 + 3179 + static inline u32 get_children_join_addr(struct intel_context *ce, 3180 + u8 child_index) 3181 + { 3182 + GEM_BUG_ON(!intel_context_is_parent(ce)); 3183 + 3184 + return i915_ggtt_offset(ce->state) + 3185 + __get_parent_scratch_offset(ce) + 3186 + offsetof(struct parent_scratch, join[child_index].semaphore); 3187 + } 3188 + 3189 + #define PARENT_GO_BB 1 3190 + #define PARENT_GO_FINI_BREADCRUMB 0 3191 + #define CHILD_GO_BB 1 3192 + #define CHILD_GO_FINI_BREADCRUMB 0 3193 + static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq, 3194 + u64 offset, u32 len, 3195 + const unsigned int flags) 3196 + { 3197 + struct intel_context *ce = rq->context; 3198 + u32 *cs; 3199 + u8 i; 3200 + 3201 + GEM_BUG_ON(!intel_context_is_parent(ce)); 3202 + 3203 + cs = intel_ring_begin(rq, 10 + 4 * ce->parallel.number_children); 3204 + if (IS_ERR(cs)) 3205 + return PTR_ERR(cs); 3206 + 3207 + /* Wait on children */ 3208 + for (i = 0; i < ce->parallel.number_children; ++i) { 3209 + *cs++ = (MI_SEMAPHORE_WAIT | 3210 + MI_SEMAPHORE_GLOBAL_GTT | 3211 + MI_SEMAPHORE_POLL | 3212 + MI_SEMAPHORE_SAD_EQ_SDD); 3213 + *cs++ = PARENT_GO_BB; 3214 + *cs++ = 
get_children_join_addr(ce, i); 3215 + *cs++ = 0; 3216 + } 3217 + 3218 + /* Turn off preemption */ 3219 + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3220 + *cs++ = MI_NOOP; 3221 + 3222 + /* Tell children go */ 3223 + cs = gen8_emit_ggtt_write(cs, 3224 + CHILD_GO_BB, 3225 + get_children_go_addr(ce), 3226 + 0); 3227 + 3228 + /* Jump to batch */ 3229 + *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3230 + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3231 + *cs++ = lower_32_bits(offset); 3232 + *cs++ = upper_32_bits(offset); 3233 + *cs++ = MI_NOOP; 3234 + 3235 + intel_ring_advance(rq, cs); 3236 + 3237 + return 0; 3238 + } 3239 + 3240 + static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq, 3241 + u64 offset, u32 len, 3242 + const unsigned int flags) 3243 + { 3244 + struct intel_context *ce = rq->context; 3245 + struct intel_context *parent = intel_context_to_parent(ce); 3246 + u32 *cs; 3247 + 3248 + GEM_BUG_ON(!intel_context_is_child(ce)); 3249 + 3250 + cs = intel_ring_begin(rq, 12); 3251 + if (IS_ERR(cs)) 3252 + return PTR_ERR(cs); 3253 + 3254 + /* Signal parent */ 3255 + cs = gen8_emit_ggtt_write(cs, 3256 + PARENT_GO_BB, 3257 + get_children_join_addr(parent, 3258 + ce->parallel.child_index), 3259 + 0); 3260 + 3261 + /* Wait on parent for go */ 3262 + *cs++ = (MI_SEMAPHORE_WAIT | 3263 + MI_SEMAPHORE_GLOBAL_GTT | 3264 + MI_SEMAPHORE_POLL | 3265 + MI_SEMAPHORE_SAD_EQ_SDD); 3266 + *cs++ = CHILD_GO_BB; 3267 + *cs++ = get_children_go_addr(parent); 3268 + *cs++ = 0; 3269 + 3270 + /* Turn off preemption */ 3271 + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3272 + 3273 + /* Jump to batch */ 3274 + *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3275 + (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); 3276 + *cs++ = lower_32_bits(offset); 3277 + *cs++ = upper_32_bits(offset); 3278 + 3279 + intel_ring_advance(rq, cs); 3280 + 3281 + return 0; 3282 + } 3283 + 3284 + static u32 * 3285 + __emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, 3286 + u32 *cs) 3287 + { 3288 + struct intel_context *ce = rq->context; 3289 + u8 i; 3290 + 3291 + GEM_BUG_ON(!intel_context_is_parent(ce)); 3292 + 3293 + /* Wait on children */ 3294 + for (i = 0; i < ce->parallel.number_children; ++i) { 3295 + *cs++ = (MI_SEMAPHORE_WAIT | 3296 + MI_SEMAPHORE_GLOBAL_GTT | 3297 + MI_SEMAPHORE_POLL | 3298 + MI_SEMAPHORE_SAD_EQ_SDD); 3299 + *cs++ = PARENT_GO_FINI_BREADCRUMB; 3300 + *cs++ = get_children_join_addr(ce, i); 3301 + *cs++ = 0; 3302 + } 3303 + 3304 + /* Turn on preemption */ 3305 + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3306 + *cs++ = MI_NOOP; 3307 + 3308 + /* Tell children go */ 3309 + cs = gen8_emit_ggtt_write(cs, 3310 + CHILD_GO_FINI_BREADCRUMB, 3311 + get_children_go_addr(ce), 3312 + 0); 3313 + 3314 + return cs; 3315 + } 3316 + 3317 + /* 3318 + * If this true, a submission of multi-lrc requests had an error and the 3319 + * requests need to be skipped. The front end (execuf IOCTL) should've called 3320 + * i915_request_skip which squashes the BB but we still need to emit the fini 3321 + * breadrcrumbs seqno write. At this point we don't know how many of the 3322 + * requests in the multi-lrc submission were generated so we can't do the 3323 + * handshake between the parent and children (e.g. if 4 requests should be 3324 + * generated but 2nd hit an error only 1 would be seen by the GuC backend). 3325 + * Simply skip the handshake, but still emit the breadcrumbd seqno, if an error 3326 + * has occurred on any of the requests in submission / relationship. 
3327 + */ 3328 + static inline bool skip_handshake(struct i915_request *rq) 3329 + { 3330 + return test_bit(I915_FENCE_FLAG_SKIP_PARALLEL, &rq->fence.flags); 3331 + } 3332 + 3333 + static u32 * 3334 + emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq, 3335 + u32 *cs) 3336 + { 3337 + struct intel_context *ce = rq->context; 3338 + 3339 + GEM_BUG_ON(!intel_context_is_parent(ce)); 3340 + 3341 + if (unlikely(skip_handshake(rq))) { 3342 + /* 3343 + * NOP everything in __emit_fini_breadcrumb_parent_no_preempt_mid_batch, 3344 + * the -6 comes from the length of the emits below. 3345 + */ 3346 + memset(cs, 0, sizeof(u32) * 3347 + (ce->engine->emit_fini_breadcrumb_dw - 6)); 3348 + cs += ce->engine->emit_fini_breadcrumb_dw - 6; 3349 + } else { 3350 + cs = __emit_fini_breadcrumb_parent_no_preempt_mid_batch(rq, cs); 3351 + } 3352 + 3353 + /* Emit fini breadcrumb */ 3354 + cs = gen8_emit_ggtt_write(cs, 3355 + rq->fence.seqno, 3356 + i915_request_active_timeline(rq)->hwsp_offset, 3357 + 0); 3358 + 3359 + /* User interrupt */ 3360 + *cs++ = MI_USER_INTERRUPT; 3361 + *cs++ = MI_NOOP; 3362 + 3363 + rq->tail = intel_ring_offset(rq, cs); 3364 + 3365 + return cs; 3366 + } 3367 + 3368 + static u32 * 3369 + __emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, 3370 + u32 *cs) 3371 + { 3372 + struct intel_context *ce = rq->context; 3373 + struct intel_context *parent = intel_context_to_parent(ce); 3374 + 3375 + GEM_BUG_ON(!intel_context_is_child(ce)); 3376 + 3377 + /* Turn on preemption */ 3378 + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3379 + *cs++ = MI_NOOP; 3380 + 3381 + /* Signal parent */ 3382 + cs = gen8_emit_ggtt_write(cs, 3383 + PARENT_GO_FINI_BREADCRUMB, 3384 + get_children_join_addr(parent, 3385 + ce->parallel.child_index), 3386 + 0); 3387 + 3388 + /* Wait parent on for go */ 3389 + *cs++ = (MI_SEMAPHORE_WAIT | 3390 + MI_SEMAPHORE_GLOBAL_GTT | 3391 + MI_SEMAPHORE_POLL | 3392 + MI_SEMAPHORE_SAD_EQ_SDD); 3393 + *cs++ = 
CHILD_GO_FINI_BREADCRUMB; 3394 + *cs++ = get_children_go_addr(parent); 3395 + *cs++ = 0; 3396 + 3397 + return cs; 3398 + } 3399 + 3400 + static u32 * 3401 + emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, 3402 + u32 *cs) 3403 + { 3404 + struct intel_context *ce = rq->context; 3405 + 3406 + GEM_BUG_ON(!intel_context_is_child(ce)); 3407 + 3408 + if (unlikely(skip_handshake(rq))) { 3409 + /* 3410 + * NOP everything in __emit_fini_breadcrumb_child_no_preempt_mid_batch, 3411 + * the -6 comes from the length of the emits below. 3412 + */ 3413 + memset(cs, 0, sizeof(u32) * 3414 + (ce->engine->emit_fini_breadcrumb_dw - 6)); 3415 + cs += ce->engine->emit_fini_breadcrumb_dw - 6; 3416 + } else { 3417 + cs = __emit_fini_breadcrumb_child_no_preempt_mid_batch(rq, cs); 3418 + } 3419 + 3420 + /* Emit fini breadcrumb */ 3421 + cs = gen8_emit_ggtt_write(cs, 3422 + rq->fence.seqno, 3423 + i915_request_active_timeline(rq)->hwsp_offset, 3424 + 0); 3425 + 3426 + /* User interrupt */ 3427 + *cs++ = MI_USER_INTERRUPT; 3428 + *cs++ = MI_NOOP; 3429 + 3430 + rq->tail = intel_ring_offset(rq, cs); 3431 + 3432 + return cs; 3433 + } 3434 + 3942 3435 static struct intel_context * 3943 - guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count) 3436 + guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, 3437 + unsigned long flags) 3944 3438 { 3945 3439 struct guc_virtual_engine *ve; 3946 3440 struct intel_guc *guc; ··· 4271 3201 } 4272 3202 4273 3203 ve->base.mask |= sibling->mask; 3204 + ve->base.logical_mask |= sibling->logical_mask; 4274 3205 4275 3206 if (n != 0 && ve->base.class != sibling->class) { 4276 3207 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", ··· 4330 3259 4331 3260 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 4332 3261 #include "selftest_guc.c" 3262 + #include "selftest_guc_multi_lrc.c" 4333 3263 #endif
+179
drivers/gpu/drm/i915/gt/uc/selftest_guc_multi_lrc.c
··· 1 + // SPDX-License-Identifier: MIT 2 + /* 3 + * Copyright © 2019 Intel Corporation 4 + */ 5 + 6 + #include "selftests/igt_spinner.h" 7 + #include "selftests/igt_reset.h" 8 + #include "selftests/intel_scheduler_helpers.h" 9 + #include "gt/intel_engine_heartbeat.h" 10 + #include "gem/selftests/mock_context.h" 11 + 12 + static void logical_sort(struct intel_engine_cs **engines, int num_engines) 13 + { 14 + struct intel_engine_cs *sorted[MAX_ENGINE_INSTANCE + 1]; 15 + int i, j; 16 + 17 + for (i = 0; i < num_engines; ++i) 18 + for (j = 0; j < MAX_ENGINE_INSTANCE + 1; ++j) { 19 + if (engines[j]->logical_mask & BIT(i)) { 20 + sorted[i] = engines[j]; 21 + break; 22 + } 23 + } 24 + 25 + memcpy(*engines, *sorted, 26 + sizeof(struct intel_engine_cs *) * num_engines); 27 + } 28 + 29 + static struct intel_context * 30 + multi_lrc_create_parent(struct intel_gt *gt, u8 class, 31 + unsigned long flags) 32 + { 33 + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; 34 + struct intel_engine_cs *engine; 35 + enum intel_engine_id id; 36 + int i = 0; 37 + 38 + for_each_engine(engine, gt, id) { 39 + if (engine->class != class) 40 + continue; 41 + 42 + siblings[i++] = engine; 43 + } 44 + 45 + if (i <= 1) 46 + return ERR_PTR(0); 47 + 48 + logical_sort(siblings, i); 49 + 50 + return intel_engine_create_parallel(siblings, 1, i); 51 + } 52 + 53 + static void multi_lrc_context_unpin(struct intel_context *ce) 54 + { 55 + struct intel_context *child; 56 + 57 + GEM_BUG_ON(!intel_context_is_parent(ce)); 58 + 59 + for_each_child(ce, child) 60 + intel_context_unpin(child); 61 + intel_context_unpin(ce); 62 + } 63 + 64 + static void multi_lrc_context_put(struct intel_context *ce) 65 + { 66 + GEM_BUG_ON(!intel_context_is_parent(ce)); 67 + 68 + /* 69 + * Only the parent gets the creation ref put in the uAPI, the parent 70 + * itself is responsible for creation ref put on the children. 
71 + */ 72 + intel_context_put(ce); 73 + } 74 + 75 + static struct i915_request * 76 + multi_lrc_nop_request(struct intel_context *ce) 77 + { 78 + struct intel_context *child; 79 + struct i915_request *rq, *child_rq; 80 + int i = 0; 81 + 82 + GEM_BUG_ON(!intel_context_is_parent(ce)); 83 + 84 + rq = intel_context_create_request(ce); 85 + if (IS_ERR(rq)) 86 + return rq; 87 + 88 + i915_request_get(rq); 89 + i915_request_add(rq); 90 + 91 + for_each_child(ce, child) { 92 + child_rq = intel_context_create_request(child); 93 + if (IS_ERR(child_rq)) 94 + goto child_error; 95 + 96 + if (++i == ce->parallel.number_children) 97 + set_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, 98 + &child_rq->fence.flags); 99 + i915_request_add(child_rq); 100 + } 101 + 102 + return rq; 103 + 104 + child_error: 105 + i915_request_put(rq); 106 + 107 + return ERR_PTR(-ENOMEM); 108 + } 109 + 110 + static int __intel_guc_multi_lrc_basic(struct intel_gt *gt, unsigned int class) 111 + { 112 + struct intel_context *parent; 113 + struct i915_request *rq; 114 + int ret; 115 + 116 + parent = multi_lrc_create_parent(gt, class, 0); 117 + if (IS_ERR(parent)) { 118 + pr_err("Failed creating contexts: %ld", PTR_ERR(parent)); 119 + return PTR_ERR(parent); 120 + } else if (!parent) { 121 + pr_debug("Not enough engines in class: %d", class); 122 + return 0; 123 + } 124 + 125 + rq = multi_lrc_nop_request(parent); 126 + if (IS_ERR(rq)) { 127 + ret = PTR_ERR(rq); 128 + pr_err("Failed creating requests: %d", ret); 129 + goto out; 130 + } 131 + 132 + ret = intel_selftest_wait_for_rq(rq); 133 + if (ret) 134 + pr_err("Failed waiting on request: %d", ret); 135 + 136 + i915_request_put(rq); 137 + 138 + if (ret >= 0) { 139 + ret = intel_gt_wait_for_idle(gt, HZ * 5); 140 + if (ret < 0) 141 + pr_err("GT failed to idle: %d\n", ret); 142 + } 143 + 144 + out: 145 + multi_lrc_context_unpin(parent); 146 + multi_lrc_context_put(parent); 147 + return ret; 148 + } 149 + 150 + static int intel_guc_multi_lrc_basic(void *arg) 151 + { 152 + 
struct intel_gt *gt = arg; 153 + unsigned int class; 154 + int ret; 155 + 156 + for (class = 0; class < MAX_ENGINE_CLASS + 1; ++class) { 157 + ret = __intel_guc_multi_lrc_basic(gt, class); 158 + if (ret) 159 + return ret; 160 + } 161 + 162 + return 0; 163 + } 164 + 165 + int intel_guc_multi_lrc_live_selftests(struct drm_i915_private *i915) 166 + { 167 + static const struct i915_subtest tests[] = { 168 + SUBTEST(intel_guc_multi_lrc_basic), 169 + }; 170 + struct intel_gt *gt = &i915->gt; 171 + 172 + if (intel_gt_is_wedged(gt)) 173 + return 0; 174 + 175 + if (!intel_uc_uses_guc_submission(&gt->uc)) 176 + return 0; 177 + 178 + return intel_gt_live_subtests(tests, gt); 179 + }
+7 -36
drivers/gpu/drm/i915/i915_debugfs.c
··· 35 35 #include "gt/intel_gt.h" 36 36 #include "gt/intel_gt_buffer_pool.h" 37 37 #include "gt/intel_gt_clock_utils.h" 38 + #include "gt/intel_gt_debugfs.h" 38 39 #include "gt/intel_gt_pm.h" 39 40 #include "gt/intel_gt_pm_debugfs.h" 40 41 #include "gt/intel_gt_requests.h" ··· 554 553 return 0; 555 554 } 556 555 557 - static int 558 - i915_wedged_get(void *data, u64 *val) 556 + static int i915_wedged_get(void *data, u64 *val) 559 557 { 560 558 struct drm_i915_private *i915 = data; 561 - int ret = intel_gt_terminally_wedged(&i915->gt); 562 559 563 - switch (ret) { 564 - case -EIO: 565 - *val = 1; 566 - return 0; 567 - case 0: 568 - *val = 0; 569 - return 0; 570 - default: 571 - return ret; 572 - } 560 + return intel_gt_debugfs_reset_show(&i915->gt, val); 573 561 } 574 562 575 - static int 576 - i915_wedged_set(void *data, u64 val) 563 + static int i915_wedged_set(void *data, u64 val) 577 564 { 578 565 struct drm_i915_private *i915 = data; 579 566 580 - /* Flush any previous reset before applying for a new one */ 581 - wait_event(i915->gt.reset.queue, 582 - !test_bit(I915_RESET_BACKOFF, &i915->gt.reset.flags)); 583 - 584 - intel_gt_handle_error(&i915->gt, val, I915_ERROR_CAPTURE, 585 - "Manually set wedged engine mask = %llx", val); 586 - return 0; 567 + return intel_gt_debugfs_reset_store(&i915->gt, val); 587 568 } 588 569 589 570 DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, ··· 710 727 static int i915_forcewake_open(struct inode *inode, struct file *file) 711 728 { 712 729 struct drm_i915_private *i915 = inode->i_private; 713 - struct intel_gt *gt = &i915->gt; 714 730 715 - atomic_inc(&gt->user_wakeref); 716 - intel_gt_pm_get(gt); 717 - if (GRAPHICS_VER(i915) >= 6) 718 - intel_uncore_forcewake_user_get(gt->uncore); 719 - 720 - return 0; 731 + return intel_gt_pm_debugfs_forcewake_user_open(&i915->gt); 721 732 } 722 733 723 734 static int i915_forcewake_release(struct inode *inode, struct file *file) 724 735 { 725 736 struct drm_i915_private *i915 = inode->i_private; 
726 - struct intel_gt *gt = &i915->gt; 727 737 728 - if (GRAPHICS_VER(i915) >= 6) 729 - intel_uncore_forcewake_user_put(&i915->uncore); 730 - intel_gt_pm_put(gt); 731 - atomic_dec(&gt->user_wakeref); 732 - 733 - return 0; 738 + return intel_gt_pm_debugfs_forcewake_user_release(&i915->gt); 734 739 } 735 740 736 741 static const struct file_operations i915_forcewake_fops = {
+2
drivers/gpu/drm/i915/i915_query.c
··· 124 124 for_each_uabi_engine(engine, i915) { 125 125 info.engine.engine_class = engine->uabi_class; 126 126 info.engine.engine_instance = engine->uabi_instance; 127 + info.flags = I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE; 127 128 info.capabilities = engine->uabi_capabilities; 129 + info.logical_instance = ilog2(engine->logical_mask); 128 130 129 131 if (copy_to_user(info_ptr, &info, sizeof(info))) 130 132 return -EFAULT;
+118 -35
drivers/gpu/drm/i915/i915_request.c
··· 1335 1335 return err; 1336 1336 } 1337 1337 1338 + static inline bool is_parallel_rq(struct i915_request *rq) 1339 + { 1340 + return intel_context_is_parallel(rq->context); 1341 + } 1342 + 1343 + static inline struct intel_context *request_to_parent(struct i915_request *rq) 1344 + { 1345 + return intel_context_to_parent(rq->context); 1346 + } 1347 + 1348 + static bool is_same_parallel_context(struct i915_request *to, 1349 + struct i915_request *from) 1350 + { 1351 + if (is_parallel_rq(to)) 1352 + return request_to_parent(to) == request_to_parent(from); 1353 + 1354 + return false; 1355 + } 1356 + 1338 1357 int 1339 1358 i915_request_await_execution(struct i915_request *rq, 1340 1359 struct dma_fence *fence) ··· 1385 1366 * want to run our callback in all cases. 1386 1367 */ 1387 1368 1388 - if (dma_fence_is_i915(fence)) 1369 + if (dma_fence_is_i915(fence)) { 1370 + if (is_same_parallel_context(rq, to_request(fence))) 1371 + continue; 1389 1372 ret = __i915_request_await_execution(rq, 1390 1373 to_request(fence)); 1391 - else 1374 + } else { 1392 1375 ret = i915_request_await_external(rq, fence); 1376 + } 1393 1377 if (ret < 0) 1394 1378 return ret; 1395 1379 } while (--nchild); ··· 1493 1471 fence)) 1494 1472 continue; 1495 1473 1496 - if (dma_fence_is_i915(fence)) 1474 + if (dma_fence_is_i915(fence)) { 1475 + if (is_same_parallel_context(rq, to_request(fence))) 1476 + continue; 1497 1477 ret = i915_request_await_request(rq, to_request(fence)); 1498 - else 1478 + } else { 1499 1479 ret = i915_request_await_external(rq, fence); 1480 + } 1500 1481 if (ret < 0) 1501 1482 return ret; 1502 1483 ··· 1575 1550 } 1576 1551 1577 1552 static struct i915_request * 1553 + __i915_request_ensure_parallel_ordering(struct i915_request *rq, 1554 + struct intel_timeline *timeline) 1555 + { 1556 + struct i915_request *prev; 1557 + 1558 + GEM_BUG_ON(!is_parallel_rq(rq)); 1559 + 1560 + prev = request_to_parent(rq)->parallel.last_rq; 1561 + if (prev) { 1562 + if 
(!__i915_request_is_complete(prev)) { 1563 + i915_sw_fence_await_sw_fence(&rq->submit, 1564 + &prev->submit, 1565 + &rq->submitq); 1566 + 1567 + if (rq->engine->sched_engine->schedule) 1568 + __i915_sched_node_add_dependency(&rq->sched, 1569 + &prev->sched, 1570 + &rq->dep, 1571 + 0); 1572 + } 1573 + i915_request_put(prev); 1574 + } 1575 + 1576 + request_to_parent(rq)->parallel.last_rq = i915_request_get(rq); 1577 + 1578 + return to_request(__i915_active_fence_set(&timeline->last_request, 1579 + &rq->fence)); 1580 + } 1581 + 1582 + static struct i915_request * 1583 + __i915_request_ensure_ordering(struct i915_request *rq, 1584 + struct intel_timeline *timeline) 1585 + { 1586 + struct i915_request *prev; 1587 + 1588 + GEM_BUG_ON(is_parallel_rq(rq)); 1589 + 1590 + prev = to_request(__i915_active_fence_set(&timeline->last_request, 1591 + &rq->fence)); 1592 + 1593 + if (prev && !__i915_request_is_complete(prev)) { 1594 + bool uses_guc = intel_engine_uses_guc(rq->engine); 1595 + bool pow2 = is_power_of_2(READ_ONCE(prev->engine)->mask | 1596 + rq->engine->mask); 1597 + bool same_context = prev->context == rq->context; 1598 + 1599 + /* 1600 + * The requests are supposed to be kept in order. However, 1601 + * we need to be wary in case the timeline->last_request 1602 + * is used as a barrier for external modification to this 1603 + * context. 
1604 + */ 1605 + GEM_BUG_ON(same_context && 1606 + i915_seqno_passed(prev->fence.seqno, 1607 + rq->fence.seqno)); 1608 + 1609 + if ((same_context && uses_guc) || (!uses_guc && pow2)) 1610 + i915_sw_fence_await_sw_fence(&rq->submit, 1611 + &prev->submit, 1612 + &rq->submitq); 1613 + else 1614 + __i915_sw_fence_await_dma_fence(&rq->submit, 1615 + &prev->fence, 1616 + &rq->dmaq); 1617 + if (rq->engine->sched_engine->schedule) 1618 + __i915_sched_node_add_dependency(&rq->sched, 1619 + &prev->sched, 1620 + &rq->dep, 1621 + 0); 1622 + } 1623 + 1624 + return prev; 1625 + } 1626 + 1627 + static struct i915_request * 1578 1628 __i915_request_add_to_timeline(struct i915_request *rq) 1579 1629 { 1580 1630 struct intel_timeline *timeline = i915_request_timeline(rq); ··· 1674 1574 * complete (to maximise our greedy late load balancing) and this 1675 1575 * precludes optimising to use semaphores serialisation of a single 1676 1576 * timeline across engines. 1577 + * 1578 + * We do not order parallel submission requests on the timeline as each 1579 + * parallel submission context has its own timeline and the ordering 1580 + * rules for parallel requests are that they must be submitted in the 1581 + * order received from the execbuf IOCTL. So rather than using the 1582 + * timeline we store a pointer to last request submitted in the 1583 + * relationship in the gem context and insert a submission fence 1584 + * between that request and request passed into this function or 1585 + * alternatively we use completion fence if gem context has a single 1586 + * timeline and this is the first submission of an execbuf IOCTL. 1677 1587 */ 1678 - prev = to_request(__i915_active_fence_set(&timeline->last_request, 1679 - &rq->fence)); 1680 - if (prev && !__i915_request_is_complete(prev)) { 1681 - bool uses_guc = intel_engine_uses_guc(rq->engine); 1682 - 1683 - /* 1684 - * The requests are supposed to be kept in order. 
However, 1685 - * we need to be wary in case the timeline->last_request 1686 - * is used as a barrier for external modification to this 1687 - * context. 1688 - */ 1689 - GEM_BUG_ON(prev->context == rq->context && 1690 - i915_seqno_passed(prev->fence.seqno, 1691 - rq->fence.seqno)); 1692 - 1693 - if ((!uses_guc && 1694 - is_power_of_2(READ_ONCE(prev->engine)->mask | rq->engine->mask)) || 1695 - (uses_guc && prev->context == rq->context)) 1696 - i915_sw_fence_await_sw_fence(&rq->submit, 1697 - &prev->submit, 1698 - &rq->submitq); 1699 - else 1700 - __i915_sw_fence_await_dma_fence(&rq->submit, 1701 - &prev->fence, 1702 - &rq->dmaq); 1703 - if (rq->engine->sched_engine->schedule) 1704 - __i915_sched_node_add_dependency(&rq->sched, 1705 - &prev->sched, 1706 - &rq->dep, 1707 - 0); 1708 - } 1588 + if (likely(!is_parallel_rq(rq))) 1589 + prev = __i915_request_ensure_ordering(rq, timeline); 1590 + else 1591 + prev = __i915_request_ensure_parallel_ordering(rq, timeline); 1709 1592 1710 1593 /* 1711 1594 * Make sure that no request gazumped us - if it was allocated after
+23
drivers/gpu/drm/i915/i915_request.h
··· 139 139 * the GPU. Here we track such boost requests on a per-request basis. 140 140 */ 141 141 I915_FENCE_FLAG_BOOST, 142 + 143 + /* 144 + * I915_FENCE_FLAG_SUBMIT_PARALLEL - request with a context in a 145 + * parent-child relationship (parallel submission, multi-lrc) should 146 + * trigger a submission to the GuC rather than just moving the context 147 + * tail. 148 + */ 149 + I915_FENCE_FLAG_SUBMIT_PARALLEL, 150 + 151 + /* 152 + * I915_FENCE_FLAG_SKIP_PARALLEL - request with a context in a 153 + * parent-child relationship (parallel submission, multi-lrc) that 154 + * hit an error while generating requests in the execbuf IOCTL. 155 + * Indicates this request should be skipped as another request in 156 + * submission / relationship encountered an error. 157 + */ 158 + I915_FENCE_FLAG_SKIP_PARALLEL, 159 + 160 + /* 161 + * I915_FENCE_FLAG_COMPOSITE - Indicates fence is part of a composite 162 + * fence (dma_fence_array) and i915 generated for parallel submission. 163 + */ 164 + I915_FENCE_FLAG_COMPOSITE, 142 165 }; 143 166 144 167 /**
+13 -8
drivers/gpu/drm/i915/i915_vma.c
··· 1234 1234 return i915_active_add_request(&vma->active, rq); 1235 1235 } 1236 1236 1237 - int i915_vma_move_to_active(struct i915_vma *vma, 1238 - struct i915_request *rq, 1239 - unsigned int flags) 1237 + int _i915_vma_move_to_active(struct i915_vma *vma, 1238 + struct i915_request *rq, 1239 + struct dma_fence *fence, 1240 + unsigned int flags) 1240 1241 { 1241 1242 struct drm_i915_gem_object *obj = vma->obj; 1242 1243 int err; ··· 1258 1257 intel_frontbuffer_put(front); 1259 1258 } 1260 1259 1261 - dma_resv_add_excl_fence(vma->resv, &rq->fence); 1262 - obj->write_domain = I915_GEM_DOMAIN_RENDER; 1263 - obj->read_domains = 0; 1260 + if (fence) { 1261 + dma_resv_add_excl_fence(vma->resv, fence); 1262 + obj->write_domain = I915_GEM_DOMAIN_RENDER; 1263 + obj->read_domains = 0; 1264 + } 1264 1265 } else { 1265 1266 if (!(flags & __EXEC_OBJECT_NO_RESERVE)) { 1266 1267 err = dma_resv_reserve_shared(vma->resv, 1); ··· 1270 1267 return err; 1271 1268 } 1272 1269 1273 - dma_resv_add_shared_fence(vma->resv, &rq->fence); 1274 - obj->write_domain = 0; 1270 + if (fence) { 1271 + dma_resv_add_shared_fence(vma->resv, fence); 1272 + obj->write_domain = 0; 1273 + } 1275 1274 } 1276 1275 1277 1276 if (flags & EXEC_OBJECT_NEEDS_FENCE && vma->fence)
+10 -3
drivers/gpu/drm/i915/i915_vma.h
··· 57 57 58 58 int __must_check __i915_vma_move_to_active(struct i915_vma *vma, 59 59 struct i915_request *rq); 60 - int __must_check i915_vma_move_to_active(struct i915_vma *vma, 61 - struct i915_request *rq, 62 - unsigned int flags); 60 + int __must_check _i915_vma_move_to_active(struct i915_vma *vma, 61 + struct i915_request *rq, 62 + struct dma_fence *fence, 63 + unsigned int flags); 64 + static inline int __must_check 65 + i915_vma_move_to_active(struct i915_vma *vma, struct i915_request *rq, 66 + unsigned int flags) 67 + { 68 + return _i915_vma_move_to_active(vma, rq, &rq->fence, flags); 69 + } 63 70 64 71 #define __i915_vma_flags(v) ((unsigned long *)&(v)->flags.counter) 65 72
+12
drivers/gpu/drm/i915/intel_wakeref.h
··· 123 123 __INTEL_WAKEREF_PUT_LAST_BIT__ 124 124 }; 125 125 126 + static inline void 127 + intel_wakeref_might_get(struct intel_wakeref *wf) 128 + { 129 + might_lock(&wf->mutex); 130 + } 131 + 126 132 /** 127 133 * intel_wakeref_put_flags: Release the wakeref 128 134 * @wf: the wakeref ··· 174 168 __intel_wakeref_put(wf, 175 169 INTEL_WAKEREF_PUT_ASYNC | 176 170 FIELD_PREP(INTEL_WAKEREF_PUT_DELAY, delay)); 171 + } 172 + 173 + static inline void 174 + intel_wakeref_might_put(struct intel_wakeref *wf) 175 + { 176 + might_lock(&wf->mutex); 177 177 } 178 178 179 179 /**
+1
drivers/gpu/drm/i915/selftests/i915_live_selftests.h
··· 48 48 selftest(perf, i915_perf_live_selftests) 49 49 selftest(slpc, intel_slpc_live_selftests) 50 50 selftest(guc, intel_guc_live_selftests) 51 + selftest(guc_multi_lrc, intel_guc_multi_lrc_live_selftests) 51 52 /* Here be dragons: keep last to run last! */ 52 53 selftest(late_gt_pm, intel_gt_pm_late_selftests)
-2
drivers/gpu/drm/i915/selftests/mock_region.c
··· 6 6 #include <drm/ttm/ttm_placement.h> 7 7 #include <linux/scatterlist.h> 8 8 9 - #include <drm/ttm/ttm_placement.h> 10 - 11 9 #include "gem/i915_gem_region.h" 12 10 #include "intel_memory_region.h" 13 11 #include "intel_region_ttm.h"
+138 -1
include/uapi/drm/i915_drm.h
··· 1830 1830 * Extensions: 1831 1831 * i915_context_engines_load_balance (I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE) 1832 1832 * i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND) 1833 + * i915_context_engines_parallel_submit (I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT) 1833 1834 */ 1834 1835 #define I915_CONTEXT_PARAM_ENGINES 0xa 1835 1836 ··· 2106 2105 } __attribute__((packed)) name__ 2107 2106 2108 2107 /** 2108 + * struct i915_context_engines_parallel_submit - Configure engine for 2109 + * parallel submission. 2110 + * 2111 + * Setup a slot in the context engine map to allow multiple BBs to be submitted 2112 + * in a single execbuf IOCTL. Those BBs will then be scheduled to run on the GPU 2113 + * in parallel. Multiple hardware contexts are created internally in the i915 to 2114 + * run these BBs. Once a slot is configured for N BBs only N BBs can be 2115 + * submitted in each execbuf IOCTL and this is implicit behavior e.g. The user 2116 + * doesn't tell the execbuf IOCTL there are N BBs, the execbuf IOCTL knows how 2117 + * many BBs there are based on the slot's configuration. The N BBs are the last 2118 + * N buffer objects or first N if I915_EXEC_BATCH_FIRST is set. 2119 + * 2120 + * The default placement behavior is to create implicit bonds between each 2121 + * context if each context maps to more than 1 physical engine (e.g. context is 2122 + * a virtual engine). Also we only allow contexts of same engine class and these 2123 + * contexts must be in logically contiguous order. Examples of the placement 2124 + * behavior are described below. Lastly, the default is to not allow BBs to be 2125 + * preempted mid-batch. Rather insert coordinated preemption points on all 2126 + * hardware contexts between each set of BBs. Flags could be added in the future 2127 + * to change both of these default behaviors. 
2128 + * 2129 + * Returns -EINVAL if hardware context placement configuration is invalid or if 2130 + * the placement configuration isn't supported on the platform / submission 2131 + * interface. 2132 + * Returns -ENODEV if extension isn't supported on the platform / submission 2133 + * interface. 2134 + * 2135 + * .. code-block:: none 2136 + * 2137 + * Examples syntax: 2138 + * CS[X] = generic engine of same class, logical instance X 2139 + * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE 2140 + * 2141 + * Example 1 pseudo code: 2142 + * set_engines(INVALID) 2143 + * set_parallel(engine_index=0, width=2, num_siblings=1, 2144 + * engines=CS[0],CS[1]) 2145 + * 2146 + * Results in the following valid placement: 2147 + * CS[0], CS[1] 2148 + * 2149 + * Example 2 pseudo code: 2150 + * set_engines(INVALID) 2151 + * set_parallel(engine_index=0, width=2, num_siblings=2, 2152 + * engines=CS[0],CS[2],CS[1],CS[3]) 2153 + * 2154 + * Results in the following valid placements: 2155 + * CS[0], CS[1] 2156 + * CS[2], CS[3] 2157 + * 2158 + * This can be thought of as two virtual engines, each containing two 2159 + * engines thereby making a 2D array. However, there are bonds tying the 2160 + * entries together and placing restrictions on how they can be scheduled. 2161 + * Specifically, the scheduler can choose only vertical columns from the 2D 2162 + * array. That is, CS[0] is bonded to CS[1] and CS[2] to CS[3]. So if the 2163 + * scheduler wants to submit to CS[0], it must also choose CS[1] and vice 2164 + * versa. Same for CS[2] requires also using CS[3]. 
2165 + * VE[0] = CS[0], CS[2] 2166 + * VE[1] = CS[1], CS[3] 2167 + * 2168 + * Example 3 pseudo code: 2169 + * set_engines(INVALID) 2170 + * set_parallel(engine_index=0, width=2, num_siblings=2, 2171 + * engines=CS[0],CS[1],CS[1],CS[3]) 2172 + * 2173 + * Results in the following valid and invalid placements: 2174 + * CS[0], CS[1] 2175 + * CS[1], CS[3] - Not logically contiguous, return -EINVAL 2176 + */ 2177 + struct i915_context_engines_parallel_submit { 2178 + /** 2179 + * @base: base user extension. 2180 + */ 2181 + struct i915_user_extension base; 2182 + 2183 + /** 2184 + * @engine_index: slot for parallel engine 2185 + */ 2186 + __u16 engine_index; 2187 + 2188 + /** 2189 + * @width: number of contexts per parallel engine or in other words the 2190 + * number of batches in each submission 2191 + */ 2192 + __u16 width; 2193 + 2194 + /** 2195 + * @num_siblings: number of siblings per context or in other words the 2196 + * number of possible placements for each submission 2197 + */ 2198 + __u16 num_siblings; 2199 + 2200 + /** 2201 + * @mbz16: reserved for future use; must be zero 2202 + */ 2203 + __u16 mbz16; 2204 + 2205 + /** 2206 + * @flags: all undefined flags must be zero, currently not defined flags 2207 + */ 2208 + __u64 flags; 2209 + 2210 + /** 2211 + * @mbz64: reserved for future use; must be zero 2212 + */ 2213 + __u64 mbz64[3]; 2214 + 2215 + /** 2216 + * @engines: 2-d array of engine instances to configure parallel engine 2217 + * 2218 + * length = width (i) * num_siblings (j) 2219 + * index = j + i * num_siblings 2220 + */ 2221 + struct i915_engine_class_instance engines[0]; 2222 + 2223 + } __packed; 2224 + 2225 + #define I915_DEFINE_CONTEXT_ENGINES_PARALLEL_SUBMIT(name__, N__) struct { \ 2226 + struct i915_user_extension base; \ 2227 + __u16 engine_index; \ 2228 + __u16 width; \ 2229 + __u16 num_siblings; \ 2230 + __u16 mbz16; \ 2231 + __u64 flags; \ 2232 + __u64 mbz64[3]; \ 2233 + struct i915_engine_class_instance engines[N__]; \ 2234 + } 
__attribute__((packed)) name__ 2235 + 2236 + /** 2109 2237 * DOC: Context Engine Map uAPI 2110 2238 * 2111 2239 * Context engine map is a new way of addressing engines when submitting batch- ··· 2293 2163 __u64 extensions; /* linked chain of extension blocks, 0 terminates */ 2294 2164 #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */ 2295 2165 #define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */ 2166 + #define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */ 2296 2167 struct i915_engine_class_instance engines[0]; 2297 2168 } __attribute__((packed)); 2298 2169 ··· 2912 2781 2913 2782 /** @flags: Engine flags. */ 2914 2783 __u64 flags; 2784 + #define I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE (1 << 0) 2915 2785 2916 2786 /** @capabilities: Capabilities of this engine. */ 2917 2787 __u64 capabilities; 2918 2788 #define I915_VIDEO_CLASS_CAPABILITY_HEVC (1 << 0) 2919 2789 #define I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC (1 << 1) 2920 2790 2791 + /** @logical_instance: Logical instance of engine */ 2792 + __u16 logical_instance; 2793 + 2921 2794 /** @rsvd1: Reserved fields. */ 2922 - __u64 rsvd1[4]; 2795 + __u16 rsvd1[3]; 2796 + /** @rsvd2: Reserved fields. */ 2797 + __u64 rsvd2[3]; 2923 2798 }; 2924 2799 2925 2800 /**