
drm/i915: Reuse the active golden render state batch

The golden render state is constant, but we recreate the batch setting
it up for every new context. If we keep that batch in a volatile cache
we can safely reuse it whenever we need to initialise a new context. We
mark the pages as purgeable and use the shrinker to recover pages from
the batch whenever we face memory pressure, recreating that batch afresh
on the next new context.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-8-chris@chris-wilson.co.uk
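
As a rough illustration of the reuse-or-recreate pattern described above — this is a simplified sketch, not the driver code; render_state_rewrite() and render_state_submit() are hypothetical stand-ins for the setup and dispatch steps in the patch below — the cached batch is reused as long as its backing pages survive, and rebuilt at whatever offset the fresh pin returns once the shrinker has discarded it:

#include <linux/types.h>

struct cached_render_state {
	void *pages;		/* NULL once the shrinker reclaimed the purgeable pages */
	u64 batch_offset;	/* offset the batch contents were last written for */
};

/* Hypothetical helpers standing in for the setup and submission steps. */
int render_state_rewrite(struct cached_render_state *so, u64 offset);
int render_state_submit(struct cached_render_state *so);

int render_state_emit(struct cached_render_state *so, u64 pinned_offset)
{
	int ret;

	/* If the purgeable pages were discarded under memory pressure,
	 * force the batch to be rebuilt from the rodata tables. */
	if (!so->pages)
		so->batch_offset = -1;

	/* Reuse the existing contents only if they were written for the
	 * offset we are bound at now; otherwise recreate them afresh. */
	if (pinned_offset != so->batch_offset) {
		ret = render_state_rewrite(so, pinned_offset);
		if (ret)
			return ret;
		so->batch_offset = pinned_offset;
	}

	return render_state_submit(so);
}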

+131 -73
+118 -70
drivers/gpu/drm/i915/i915_gem_render_state.c
···
 #include "i915_drv.h"
 #include "intel_renderstate.h"
 
-struct render_state {
+struct intel_render_state {
 	const struct intel_renderstate_rodata *rodata;
 	struct i915_vma *vma;
-	u32 aux_batch_size;
-	u32 aux_batch_offset;
+	u32 batch_offset;
+	u32 batch_size;
+	u32 aux_offset;
+	u32 aux_size;
 };
 
 static const struct intel_renderstate_rodata *
-render_state_get_rodata(const struct drm_i915_gem_request *req)
+render_state_get_rodata(const struct intel_engine_cs *engine)
 {
-	switch (INTEL_GEN(req->i915)) {
+	switch (INTEL_GEN(engine->i915)) {
 	case 6:
 		return &gen6_null_state;
 	case 7:
···
  */
 #define OUT_BATCH(batch, i, val) \
 	do { \
-		if (WARN_ON((i) >= PAGE_SIZE / sizeof(u32))) { \
-			ret = -ENOSPC; \
-			goto err_out; \
-		} \
+		if ((i) >= PAGE_SIZE / sizeof(u32)) \
+			goto err; \
 		(batch)[(i)++] = (val); \
 	} while(0)
 
-static int render_state_setup(struct render_state *so)
+static int render_state_setup(struct intel_render_state *so,
+			      struct drm_i915_private *i915)
 {
-	struct drm_i915_private *dev_priv = to_i915(so->vma->vm->dev);
 	const struct intel_renderstate_rodata *rodata = so->rodata;
-	const bool has_64bit_reloc = INTEL_GEN(dev_priv) >= 8;
+	const bool has_64bit_reloc = INTEL_GEN(i915) >= 8;
+	struct drm_i915_gem_object *obj = so->vma->obj;
 	unsigned int i = 0, reloc_index = 0;
-	struct page *page;
+	unsigned int needs_clflush;
 	u32 *d;
 	int ret;
 
-	ret = i915_gem_object_set_to_cpu_domain(so->vma->obj, true);
+	ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
 	if (ret)
 		return ret;
 
-	page = i915_gem_object_get_dirty_page(so->vma->obj, 0);
-	d = kmap(page);
+	d = kmap_atomic(i915_gem_object_get_dirty_page(obj, 0));
 
 	while (i < rodata->batch_items) {
 		u32 s = rodata->batch[i];
···
 			s = lower_32_bits(r);
 			if (has_64bit_reloc) {
 				if (i + 1 >= rodata->batch_items ||
-				    rodata->batch[i + 1] != 0) {
-					ret = -EINVAL;
-					goto err_out;
-				}
+				    rodata->batch[i + 1] != 0)
+					goto err;
 
 				d[i++] = s;
 				s = upper_32_bits(r);
···
 		d[i++] = s;
 	}
 
+	if (rodata->reloc[reloc_index] != -1) {
+		DRM_ERROR("only %d relocs resolved\n", reloc_index);
+		goto err;
+	}
+
+	so->batch_offset = so->vma->node.start;
+	so->batch_size = rodata->batch_items * sizeof(u32);
+
 	while (i % CACHELINE_DWORDS)
 		OUT_BATCH(d, i, MI_NOOP);
 
-	so->aux_batch_offset = i * sizeof(u32);
+	so->aux_offset = i * sizeof(u32);
 
-	if (HAS_POOLED_EU(dev_priv)) {
+	if (HAS_POOLED_EU(i915)) {
 		/*
 		 * We always program 3x6 pool config but depending upon which
 		 * subslice is disabled HW drops down to appropriate config
···
 	}
 
 	OUT_BATCH(d, i, MI_BATCH_BUFFER_END);
-	so->aux_batch_size = (i * sizeof(u32)) - so->aux_batch_offset;
-
+	so->aux_size = i * sizeof(u32) - so->aux_offset;
+	so->aux_offset += so->batch_offset;
 	/*
 	 * Since we are sending length, we need to strictly conform to
 	 * all requirements. For Gen2 this must be a multiple of 8.
 	 */
-	so->aux_batch_size = ALIGN(so->aux_batch_size, 8);
+	so->aux_size = ALIGN(so->aux_size, 8);
 
-	kunmap(page);
+	if (needs_clflush)
+		drm_clflush_virt_range(d, i * sizeof(u32));
+	kunmap_atomic(d);
 
-	ret = i915_gem_object_set_to_gtt_domain(so->vma->obj, false);
-	if (ret)
-		return ret;
-
-	if (rodata->reloc[reloc_index] != -1) {
-		DRM_ERROR("only %d relocs resolved\n", reloc_index);
-		return -EINVAL;
-	}
-
-	return 0;
-
-err_out:
-	kunmap(page);
+	ret = i915_gem_object_set_to_gtt_domain(obj, false);
+out:
+	i915_gem_obj_finish_shmem_access(obj);
 	return ret;
+
+err:
+	kunmap_atomic(d);
+	ret = -EINVAL;
+	goto out;
 }
 
 #undef OUT_BATCH
 
-int i915_gem_render_state_init(struct drm_i915_gem_request *req)
+int i915_gem_render_state_init(struct intel_engine_cs *engine)
 {
-	struct render_state so;
+	struct intel_render_state *so;
+	const struct intel_renderstate_rodata *rodata;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
-	if (WARN_ON(req->engine->id != RCS))
-		return -ENOENT;
-
-	so.rodata = render_state_get_rodata(req);
-	if (!so.rodata)
+	if (engine->id != RCS)
 		return 0;
 
-	if (so.rodata->batch_items * 4 > 4096)
+	rodata = render_state_get_rodata(engine);
+	if (!rodata)
+		return 0;
+
+	if (rodata->batch_items * 4 > 4096)
 		return -EINVAL;
 
-	obj = i915_gem_object_create_internal(req->i915, 4096);
-	if (IS_ERR(obj))
-		return PTR_ERR(obj);
+	so = kmalloc(sizeof(*so), GFP_KERNEL);
+	if (!so)
+		return -ENOMEM;
 
-	so.vma = i915_vma_create(obj, &req->i915->ggtt.base, NULL);
-	if (IS_ERR(so.vma)) {
-		ret = PTR_ERR(so.vma);
+	obj = i915_gem_object_create_internal(engine->i915, 4096);
+	if (IS_ERR(obj)) {
+		ret = PTR_ERR(obj);
+		goto err_free;
+	}
+
+	so->vma = i915_vma_create(obj, &engine->i915->ggtt.base, NULL);
+	if (IS_ERR(so->vma)) {
+		ret = PTR_ERR(so->vma);
 		goto err_obj;
 	}
 
-	ret = i915_vma_pin(so.vma, 0, 0, PIN_GLOBAL);
-	if (ret)
-		goto err_obj;
+	so->rodata = rodata;
+	engine->render_state = so;
+	return 0;
 
-	ret = render_state_setup(&so);
-	if (ret)
-		goto err_unpin;
+err_obj:
+	i915_gem_object_put(obj);
+err_free:
+	kfree(so);
+	return ret;
+}
 
-	ret = req->engine->emit_bb_start(req, so.vma->node.start,
-					 so.rodata->batch_items * 4,
+int i915_gem_render_state_emit(struct drm_i915_gem_request *req)
+{
+	struct intel_render_state *so;
+	int ret;
+
+	so = req->engine->render_state;
+	if (!so)
+		return 0;
+
+	/* Recreate the page after shrinking */
+	if (!so->vma->obj->pages)
+		so->batch_offset = -1;
+
+	ret = i915_vma_pin(so->vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
+	if (ret)
+		return ret;
+
+	if (so->vma->node.start != so->batch_offset) {
+		ret = render_state_setup(so, req->i915);
+		if (ret)
+			goto err_unpin;
+	}
+
+	ret = req->engine->emit_bb_start(req,
+					 so->batch_offset, so->batch_size,
 					 I915_DISPATCH_SECURE);
 	if (ret)
 		goto err_unpin;
 
-	if (so.aux_batch_size > 8) {
+	if (so->aux_size > 8) {
 		ret = req->engine->emit_bb_start(req,
-						 (so.vma->node.start +
-						  so.aux_batch_offset),
-						 so.aux_batch_size,
+						 so->aux_offset, so->aux_size,
 						 I915_DISPATCH_SECURE);
 		if (ret)
 			goto err_unpin;
 	}
 
-	i915_vma_move_to_active(so.vma, req, 0);
+	i915_vma_move_to_active(so->vma, req, 0);
 err_unpin:
-	i915_vma_unpin(so.vma);
-	i915_vma_close(so.vma);
-err_obj:
-	__i915_gem_object_release_unless_active(obj);
+	i915_vma_unpin(so->vma);
 	return ret;
+}
+
+void i915_gem_render_state_fini(struct intel_engine_cs *engine)
+{
+	struct intel_render_state *so;
+	struct drm_i915_gem_object *obj;
+
+	so = fetch_and_zero(&engine->render_state);
+	if (!so)
+		return;
+
+	obj = so->vma->obj;
+
+	i915_vma_close(so->vma);
+	__i915_gem_object_release_unless_active(obj);
+
+	kfree(so);
 }
+3 -1
drivers/gpu/drm/i915/i915_gem_render_state.h
···
 
 struct drm_i915_gem_request;
 
-int i915_gem_render_state_init(struct drm_i915_gem_request *req);
+int i915_gem_render_state_init(struct intel_engine_cs *engine);
+int i915_gem_render_state_emit(struct drm_i915_gem_request *req);
+void i915_gem_render_state_fini(struct intel_engine_cs *engine);
 
 #endif /* _I915_GEM_RENDER_STATE_H_ */
+5
drivers/gpu/drm/i915/intel_engine_cs.c
···
 	if (ret)
 		return ret;
 
+	ret = i915_gem_render_state_init(engine);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 
···
 {
 	intel_engine_cleanup_scratch(engine);
 
+	i915_gem_render_state_fini(engine);
 	intel_engine_fini_breadcrumbs(engine);
 	intel_engine_cleanup_cmd_parser(engine);
 	i915_gem_batch_pool_fini(&engine->batch_pool);
+1 -1
drivers/gpu/drm/i915/intel_lrc.c
···
 	if (ret)
 		DRM_ERROR("MOCS failed to program: expect performance issues.\n");
 
-	return i915_gem_render_state_init(req);
+	return i915_gem_render_state_emit(req);
 }
 
 /**
+1 -1
drivers/gpu/drm/i915/intel_ringbuffer.c
···
 	if (ret != 0)
 		return ret;
 
-	ret = i915_gem_render_state_init(req);
+	ret = i915_gem_render_state_emit(req);
 	if (ret)
 		return ret;
 
+3
drivers/gpu/drm/i915/intel_ringbuffer.h
···
 };
 
 struct drm_i915_gem_request;
+struct intel_render_state;
 
 struct intel_engine_cs {
 	struct drm_i915_private *i915;
···
 	u32 mmio_base;
 	unsigned int irq_shift;
 	struct intel_ring *buffer;
+
+	struct intel_render_state *render_state;
 
 	/* Rather than have every client wait upon all user interrupts,
 	 * with the herd waking after every interrupt and each doing the