
Merge tag 'gvt-fixes-2018-03-15' of https://github.com/intel/gvt-linux into drm-intel-fixes

gvt-fixes-2018-03-15

- Two warning fixes, for runtime PM and user copy (Xiong, Zhenyu)
- OA context fix for vGPU profiling (Min)
- Privileged batch buffer relocation fix (Fred)

Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180315100023.5n5a74afky6qinoh@zhen-hp.sh.intel.com

4 files changed, 82 insertions(+), 4 deletions(-)

drivers/gpu/drm/i915/gvt/cmd_parser.c (+8)
···
  471  471         * used when ret from 2nd level batch buffer
  472  472         */
  473  473        int saved_buf_addr_type;
       474  +     bool is_ctx_wa;
  474  475
  475  476        struct cmd_info *info;
  476  477
···
 1716 1715        bb->accessing = true;
 1717 1716        bb->bb_start_cmd_va = s->ip_va;
 1718 1717
      1718  +     if ((s->buf_type == BATCH_BUFFER_INSTRUCTION) && (!s->is_ctx_wa))
      1719  +             bb->bb_offset = s->ip_va - s->rb_va;
      1720  +     else
      1721  +             bb->bb_offset = 0;
      1722  +
 1719 1723        /*
 1720 1724         * ip_va saves the virtual address of the shadow batch buffer, while
 1721 1725         * ip_gma saves the graphics address of the original batch buffer.
···
 2577 2571        s.ring_tail = gma_tail;
 2578 2572        s.rb_va = workload->shadow_ring_buffer_va;
 2579 2573        s.workload = workload;
      2574  +     s.is_ctx_wa = false;
 2580 2575
 2581 2576        if ((bypass_scan_mask & (1 << workload->ring_id)) ||
 2582 2577                        gma_head == gma_tail)
···
 2631 2624        s.ring_tail = gma_tail;
 2632 2625        s.rb_va = wa_ctx->indirect_ctx.shadow_va;
 2633 2626        s.workload = workload;
      2627  +     s.is_ctx_wa = true;
 2634 2628
 2635 2629        if (!intel_gvt_ggtt_validate_range(s.vgpu, s.ring_start, s.ring_size)) {
 2636 2630                ret = -EINVAL;
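The cmd_parser.c change records where a privileged MI_BATCH_BUFFER_START sits relative to the ring buffer base: scanning happens in a temporary ring scan buffer, so a pointer captured at scan time goes stale once the workload is later copied into the real shadow ring buffer. A minimal userspace sketch of that rebase idea (all names and values here are illustrative, not the kernel code):

/* Sketch: remember the command's byte offset from the ring base at scan
 * time, then rebase the pointer onto the real buffer before patching the
 * address operand (dword 1 of the fake batch-buffer-start command). */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
        uint32_t scan_buf[8] = { 0x18800101, 0xdeadbeef }; /* fake command + operand */
        uint32_t ring_buf[8];

        /* at scan time: record offset of the command within the scan buffer */
        uint32_t *bb_start_cmd_va = &scan_buf[0];
        unsigned long bb_offset = (char *)bb_start_cmd_va - (char *)scan_buf;

        /* later: the workload is copied into the real shadow ring buffer */
        memcpy(ring_buf, scan_buf, sizeof(scan_buf));

        /* rebase the stale pointer, then write the relocation */
        bb_start_cmd_va = (uint32_t *)((char *)ring_buf + bb_offset);
        bb_start_cmd_va[1] = 0x00100000; /* pretend GGTT offset of the shadow bb */

        printf("relocated operand: 0x%x\n", (unsigned)ring_buf[1]);
        return 0;
}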
drivers/gpu/drm/i915/gvt/mmio_context.c (+2)
···
  394  394         * performace for batch mmio read/write, so we need
  395  395         * handle forcewake mannually.
  396  396         */
       397  +     intel_runtime_pm_get(dev_priv);
  397  398        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
  398  399        switch_mmio(pre, next, ring_id);
  399  400        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
       401  +     intel_runtime_pm_put(dev_priv);
  400  402  }
  401  403
  402  404  /**
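The mmio_context.c fix brackets the whole MMIO switch with a runtime PM reference, since forcewake alone does not stop the device from runtime-suspending underneath the register accesses. A toy model of the required ordering (plain C, all names illustrative; this is not the i915 implementation):

/* Sketch: the device reference is outermost and released last, so the
 * forcewake window and the batched MMIO always run on an awake device. */
#include <assert.h>
#include <stdio.h>

static int device_awake_refs;   /* models intel_runtime_pm_get/put */
static int forcewake_refs;      /* models intel_uncore_forcewake_get/put */

static void runtime_pm_get(void) { device_awake_refs++; }

static void runtime_pm_put(void)
{
        assert(device_awake_refs > 0);
        device_awake_refs--;
}

static void forcewake_get(void)
{
        /* touching forcewake registers already requires an awake device */
        assert(device_awake_refs > 0);
        forcewake_refs++;
}

static void forcewake_put(void)
{
        assert(forcewake_refs > 0);
        forcewake_refs--;
}

int main(void)
{
        runtime_pm_get();       /* outermost: keep the device from suspending */
        forcewake_get();
        assert(device_awake_refs > 0 && forcewake_refs > 0);
        puts("batched MMIO read/write is safe here");
        forcewake_put();
        runtime_pm_put();       /* last acquired is not this one: LIFO release */
        return 0;
}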
drivers/gpu/drm/i915/gvt/scheduler.c (+67 -4)
···
   52   52        pdp_pair[i].val = pdp[7 - i];
   53   53  }
   54   54
        55  +/*
        56  + * when populating shadow ctx from guest, we should not override OA-related
        57  + * registers, so that they will not be clobbered by guest OA configs. This
        58  + * makes it possible to capture OA data from host for both host and guests.
        59  + */
        60  +static void sr_oa_regs(struct intel_vgpu_workload *workload,
        61  +             u32 *reg_state, bool save)
        62  +{
        63  +     struct drm_i915_private *dev_priv = workload->vgpu->gvt->dev_priv;
        64  +     u32 ctx_oactxctrl = dev_priv->perf.oa.ctx_oactxctrl_offset;
        65  +     u32 ctx_flexeu0 = dev_priv->perf.oa.ctx_flexeu0_offset;
        66  +     int i = 0;
        67  +     u32 flex_mmio[] = {
        68  +             i915_mmio_reg_offset(EU_PERF_CNTL0),
        69  +             i915_mmio_reg_offset(EU_PERF_CNTL1),
        70  +             i915_mmio_reg_offset(EU_PERF_CNTL2),
        71  +             i915_mmio_reg_offset(EU_PERF_CNTL3),
        72  +             i915_mmio_reg_offset(EU_PERF_CNTL4),
        73  +             i915_mmio_reg_offset(EU_PERF_CNTL5),
        74  +             i915_mmio_reg_offset(EU_PERF_CNTL6),
        75  +     };
        76  +
        77  +     if (!workload || !reg_state || workload->ring_id != RCS)
        78  +             return;
        79  +
        80  +     if (save) {
        81  +             workload->oactxctrl = reg_state[ctx_oactxctrl + 1];
        82  +
        83  +             for (i = 0; i < ARRAY_SIZE(workload->flex_mmio); i++) {
        84  +                     u32 state_offset = ctx_flexeu0 + i * 2;
        85  +
        86  +                     workload->flex_mmio[i] = reg_state[state_offset + 1];
        87  +             }
        88  +     } else {
        89  +             reg_state[ctx_oactxctrl] =
        90  +                     i915_mmio_reg_offset(GEN8_OACTXCONTROL);
        91  +             reg_state[ctx_oactxctrl + 1] = workload->oactxctrl;
        92  +
        93  +             for (i = 0; i < ARRAY_SIZE(workload->flex_mmio); i++) {
        94  +                     u32 state_offset = ctx_flexeu0 + i * 2;
        95  +                     u32 mmio = flex_mmio[i];
        96  +
        97  +                     reg_state[state_offset] = mmio;
        98  +                     reg_state[state_offset + 1] = workload->flex_mmio[i];
        99  +             }
       100  +     }
       101  +}
       102  +
   55  103  static int populate_shadow_context(struct intel_vgpu_workload *workload)
   56  104  {
   57  105        struct intel_vgpu *vgpu = workload->vgpu;
···
  146   98        page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
  147   99        shadow_ring_context = kmap(page);
  148  100
       101  +     sr_oa_regs(workload, (u32 *)shadow_ring_context, true);
  149  102  #define COPY_REG(name) \
  150  103        intel_gvt_hypervisor_read_gpa(vgpu, workload->ring_context_gpa \
  151  104        + RING_CTX_OFF(name.val), &shadow_ring_context->name.val, 4)
···
  171  122                        sizeof(*shadow_ring_context),
  172  123                        I915_GTT_PAGE_SIZE - sizeof(*shadow_ring_context));
  173  124
       125  +     sr_oa_regs(workload, (u32 *)shadow_ring_context, false);
  174  126        kunmap(page);
  175  127        return 0;
  176  128  }
···
  425  375                ret = PTR_ERR(bb->vma);
  426  376                goto err;
  427  377        }
       378  +
       379  +     /* For a privileged batch buffer that is not wa_ctx, bb_start_cmd_va
       380  +      * is only updated in ring_scan_buffer, not at the real ring address
       381  +      * allocated in the later copy_workload_to_ring_buffer. Please note
       382  +      * that shadow_ring_buffer_va now points to the real ring buffer va
       383  +      * in copy_workload_to_ring_buffer.
       384  +      */
       385  +
       386  +     if (bb->bb_offset)
       387  +             bb->bb_start_cmd_va = workload->shadow_ring_buffer_va
       388  +                     + bb->bb_offset;
  428  389
  429  390        /* relocate shadow batch buffer */
  430  391        bb->bb_start_cmd_va[1] = i915_ggtt_offset(bb->vma);
···
 1105 1044
 1106 1045        bitmap_zero(s->shadow_ctx_desc_updated, I915_NUM_ENGINES);
 1107 1046
 1108       -     s->workloads = kmem_cache_create("gvt-g_vgpu_workload",
 1109       -                     sizeof(struct intel_vgpu_workload), 0,
 1110       -                     SLAB_HWCACHE_ALIGN,
 1111       -                     NULL);
      1047  +     s->workloads = kmem_cache_create_usercopy("gvt-g_vgpu_workload",
      1048  +                     sizeof(struct intel_vgpu_workload), 0,
      1049  +                     SLAB_HWCACHE_ALIGN,
      1050  +                     offsetof(struct intel_vgpu_workload, rb_tail),
      1051  +                     sizeof_field(struct intel_vgpu_workload, rb_tail),
      1052  +                     NULL);
 1112 1053
 1113 1054        if (!s->workloads) {
 1114 1055                ret = -ENOMEM;
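The allocation change switches to kmem_cache_create_usercopy(), the API added in v4.16 that declares the only byte region of a slab object that copy_to_user()/copy_from_user() may touch; with CONFIG_HARDENED_USERCOPY, copies outside that window are rejected instead of warning on the whole object. Here only the rb_tail field is whitelisted. A hedged sketch of the API (the struct and names below are illustrative, not from i915):

/* Sketch: whitelist one field of a kernel-module slab cache for usercopy. */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/stddef.h>

struct example_obj {
        unsigned long internal_state;   /* kernel-only, never user-copied */
        char user_blob[64];             /* the whitelisted region */
};

static struct kmem_cache *example_cache;

static int __init example_init(void)
{
        example_cache = kmem_cache_create_usercopy("example_obj",
                        sizeof(struct example_obj), 0,
                        SLAB_HWCACHE_ALIGN,
                        offsetof(struct example_obj, user_blob),     /* useroffset */
                        sizeof_field(struct example_obj, user_blob), /* usersize */
                        NULL);
        return example_cache ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
        kmem_cache_destroy(example_cache);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");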
drivers/gpu/drm/i915/gvt/scheduler.h (+5)
···
  110  110        /* shadow batch buffer */
  111  111        struct list_head shadow_bb;
  112  112        struct intel_shadow_wa_ctx wa_ctx;
       113  +
       114  +     /* oa registers */
       115  +     u32 oactxctrl;
       116  +     u32 flex_mmio[7];
  113  117  };
  114  118
  115  119  struct intel_vgpu_shadow_bb {
···
  124  120        u32 *bb_start_cmd_va;
  125  121        unsigned int clflush;
  126  122        bool accessing;
       123  +     unsigned long bb_offset;
  127  124  };
  128  125
  129  126  #define workload_q_head(vgpu, ring_id) \
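For context on why sr_oa_regs() reads reg_state[off + 1] and steps the flex-EU slots by i * 2: the logical ring context image stores registers as (offset, value) dword pairs. A small illustrative program (the slot index is made up; in the kernel the real offsets come from dev_priv->perf.oa at runtime, and 0x2360 is, to the best of my knowledge, GEN8_OACTXCONTROL's MMIO offset):

/* Sketch: model the (offset, value) pairing of a context register image. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t reg_state[16] = {0};
        uint32_t ctx_oactxctrl = 4;             /* made-up slot index */

        /* restore direction: dword N holds the register offset, N+1 the value */
        reg_state[ctx_oactxctrl]     = 0x2360;  /* GEN8_OACTXCONTROL mmio offset */
        reg_state[ctx_oactxctrl + 1] = 0x1234;  /* previously saved value */

        /* save direction: the value lives one dword after the offset */
        printf("saved OACTXCONTROL = 0x%x\n", (unsigned)reg_state[ctx_oactxctrl + 1]);
        return 0;
}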