1/*
2 * Copyright © 2008-2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 * Zou Nan hai <nanhai.zou@intel.com>
 26 * Xiang Hai hao <haihao.xiang@intel.com>
27 *
28 */
29
30#include <linux/log2.h>
31
32#include <drm/i915_drm.h>
33
34#include "i915_drv.h"
35#include "i915_gem_render_state.h"
36#include "i915_reset.h"
37#include "i915_trace.h"
38#include "intel_drv.h"
39#include "intel_workarounds.h"
40
41/* Rough estimate of the typical request size, performing a flush,
42 * set-context and then emitting the batch.
43 */
44#define LEGACY_REQUEST_SIZE 200
45
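/*
 * Recompute the free space between the last emitted dword and the hardware
 * head, and cache the result in ring->space.
 */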
46unsigned int intel_ring_update_space(struct intel_ring *ring)
47{
48 unsigned int space;
49
50 space = __intel_ring_space(ring->head, ring->emit, ring->size);
51
52 ring->space = space;
53 return space;
54}
55
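/*
 * gen2 flush: emit MI_FLUSH (plus MI_READ_FLUSH when invalidating) and, for
 * EMIT_FLUSH, a few scratch-page stores that presumably serve to serialise
 * the write flush before the final MI_FLUSH | MI_NO_WRITE_FLUSH.
 */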
56static int
57gen2_render_ring_flush(struct i915_request *rq, u32 mode)
58{
59 unsigned int num_store_dw;
60 u32 cmd, *cs;
61
62 cmd = MI_FLUSH;
63 num_store_dw = 0;
64 if (mode & EMIT_INVALIDATE)
65 cmd |= MI_READ_FLUSH;
66 if (mode & EMIT_FLUSH)
67 num_store_dw = 4;
68
69 cs = intel_ring_begin(rq, 2 + 3 * num_store_dw);
70 if (IS_ERR(cs))
71 return PTR_ERR(cs);
72
73 *cs++ = cmd;
74 while (num_store_dw--) {
75 *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
76 *cs++ = i915_scratch_offset(rq->i915);
77 *cs++ = 0;
78 }
79 *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
80
81 intel_ring_advance(rq, cs);
82
83 return 0;
84}
85
86static int
87gen4_render_ring_flush(struct i915_request *rq, u32 mode)
88{
89 u32 cmd, *cs;
90 int i;
91
92 /*
93 * read/write caches:
94 *
95 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
96 * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is
97 * also flushed at 2d versus 3d pipeline switches.
98 *
99 * read-only caches:
100 *
101 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
102 * MI_READ_FLUSH is set, and is always flushed on 965.
103 *
104 * I915_GEM_DOMAIN_COMMAND may not exist?
105 *
106 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
107 * invalidated when MI_EXE_FLUSH is set.
108 *
109 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
110 * invalidated with every MI_FLUSH.
111 *
112 * TLBs:
113 *
114 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
 115 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
116 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
117 * are flushed at any MI_FLUSH.
118 */
119
120 cmd = MI_FLUSH;
121 if (mode & EMIT_INVALIDATE) {
122 cmd |= MI_EXE_FLUSH;
123 if (IS_G4X(rq->i915) || IS_GEN(rq->i915, 5))
124 cmd |= MI_INVALIDATE_ISP;
125 }
126
127 i = 2;
128 if (mode & EMIT_INVALIDATE)
129 i += 20;
130
131 cs = intel_ring_begin(rq, i);
132 if (IS_ERR(cs))
133 return PTR_ERR(cs);
134
135 *cs++ = cmd;
136
137 /*
138 * A random delay to let the CS invalidate take effect? Without this
139 * delay, the GPU relocation path fails as the CS does not see
140 * the updated contents. Just as important, if we apply the flushes
141 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
142 * write and before the invalidate on the next batch), the relocations
 143 * still fail. This implies that it is a delay following invalidation
144 * that is required to reset the caches as opposed to a delay to
145 * ensure the memory is written.
146 */
147 if (mode & EMIT_INVALIDATE) {
148 *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
149 *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
150 *cs++ = 0;
151 *cs++ = 0;
152
153 for (i = 0; i < 12; i++)
154 *cs++ = MI_FLUSH;
155
156 *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
157 *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
158 *cs++ = 0;
159 *cs++ = 0;
160 }
161
162 *cs++ = cmd;
163
164 intel_ring_advance(rq, cs);
165
166 return 0;
167}
168
169/*
170 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
171 * implementing two workarounds on gen6. From section 1.4.7.1
172 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
173 *
174 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
175 * produced by non-pipelined state commands), software needs to first
176 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
177 * 0.
178 *
179 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
180 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
181 *
182 * And the workaround for these two requires this workaround first:
183 *
184 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
185 * BEFORE the pipe-control with a post-sync op and no write-cache
186 * flushes.
187 *
188 * And this last workaround is tricky because of the requirements on
189 * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
190 * volume 2 part 1:
191 *
192 * "1 of the following must also be set:
193 * - Render Target Cache Flush Enable ([12] of DW1)
194 * - Depth Cache Flush Enable ([0] of DW1)
195 * - Stall at Pixel Scoreboard ([1] of DW1)
196 * - Depth Stall ([13] of DW1)
197 * - Post-Sync Operation ([13] of DW1)
198 * - Notify Enable ([8] of DW1)"
199 *
200 * The cache flushes require the workaround flush that triggered this
201 * one, so we can't use it. Depth stall would trigger the same.
202 * Post-sync nonzero is what triggered this second workaround, so we
203 * can't use that one either. Notify enable is IRQs, which aren't
204 * really our business. That leaves only stall at scoreboard.
205 */
206static int
207gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
208{
209 u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
210 u32 *cs;
211
212 cs = intel_ring_begin(rq, 6);
213 if (IS_ERR(cs))
214 return PTR_ERR(cs);
215
216 *cs++ = GFX_OP_PIPE_CONTROL(5);
217 *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
218 *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
219 *cs++ = 0; /* low dword */
220 *cs++ = 0; /* high dword */
221 *cs++ = MI_NOOP;
222 intel_ring_advance(rq, cs);
223
224 cs = intel_ring_begin(rq, 6);
225 if (IS_ERR(cs))
226 return PTR_ERR(cs);
227
228 *cs++ = GFX_OP_PIPE_CONTROL(5);
229 *cs++ = PIPE_CONTROL_QW_WRITE;
230 *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
231 *cs++ = 0;
232 *cs++ = 0;
233 *cs++ = MI_NOOP;
234 intel_ring_advance(rq, cs);
235
236 return 0;
237}
238
239static int
240gen6_render_ring_flush(struct i915_request *rq, u32 mode)
241{
242 u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
243 u32 *cs, flags = 0;
244 int ret;
245
246 /* Force SNB workarounds for PIPE_CONTROL flushes */
247 ret = gen6_emit_post_sync_nonzero_flush(rq);
248 if (ret)
249 return ret;
250
251 /* Just flush everything. Experiments have shown that reducing the
252 * number of bits based on the write domains has little performance
253 * impact.
254 */
255 if (mode & EMIT_FLUSH) {
256 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
257 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
258 /*
259 * Ensure that any following seqno writes only happen
260 * when the render cache is indeed flushed.
261 */
262 flags |= PIPE_CONTROL_CS_STALL;
263 }
264 if (mode & EMIT_INVALIDATE) {
265 flags |= PIPE_CONTROL_TLB_INVALIDATE;
266 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
267 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
268 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
269 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
270 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
271 /*
272 * TLB invalidate requires a post-sync write.
273 */
274 flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
275 }
276
277 cs = intel_ring_begin(rq, 4);
278 if (IS_ERR(cs))
279 return PTR_ERR(cs);
280
281 *cs++ = GFX_OP_PIPE_CONTROL(4);
282 *cs++ = flags;
283 *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
284 *cs++ = 0;
285 intel_ring_advance(rq, cs);
286
287 return 0;
288}
289
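/*
 * gen6 render breadcrumb: replay the post-sync-nonzero workaround
 * PIPE_CONTROLs, then flush the render caches while writing the request
 * seqno, store the hangcheck seqno, and finish with a user interrupt.
 */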
290static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
291{
292 /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
293 *cs++ = GFX_OP_PIPE_CONTROL(4);
294 *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
295 *cs++ = 0;
296 *cs++ = 0;
297
298 *cs++ = GFX_OP_PIPE_CONTROL(4);
299 *cs++ = PIPE_CONTROL_QW_WRITE;
300 *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
301 *cs++ = 0;
302
303 /* Finally we can flush and with it emit the breadcrumb */
304 *cs++ = GFX_OP_PIPE_CONTROL(4);
305 *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
306 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
307 PIPE_CONTROL_DC_FLUSH_ENABLE |
308 PIPE_CONTROL_QW_WRITE |
309 PIPE_CONTROL_CS_STALL);
310 *cs++ = rq->timeline->hwsp_offset | PIPE_CONTROL_GLOBAL_GTT;
311 *cs++ = rq->fence.seqno;
312
313 *cs++ = GFX_OP_PIPE_CONTROL(4);
314 *cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_STORE_DATA_INDEX;
315 *cs++ = I915_GEM_HWS_HANGCHECK_ADDR | PIPE_CONTROL_GLOBAL_GTT;
316 *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
317
318 *cs++ = MI_USER_INTERRUPT;
319 *cs++ = MI_NOOP;
320
321 rq->tail = intel_ring_offset(rq, cs);
322 assert_ring_tail_valid(rq->ring, rq->tail);
323
324 return cs;
325}
326
327static int
328gen7_render_ring_cs_stall_wa(struct i915_request *rq)
329{
330 u32 *cs;
331
332 cs = intel_ring_begin(rq, 4);
333 if (IS_ERR(cs))
334 return PTR_ERR(cs);
335
336 *cs++ = GFX_OP_PIPE_CONTROL(4);
337 *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
338 *cs++ = 0;
339 *cs++ = 0;
340 intel_ring_advance(rq, cs);
341
342 return 0;
343}
344
345static int
346gen7_render_ring_flush(struct i915_request *rq, u32 mode)
347{
348 u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
349 u32 *cs, flags = 0;
350
351 /*
352 * Ensure that any following seqno writes only happen when the render
353 * cache is indeed flushed.
354 *
355 * Workaround: 4th PIPE_CONTROL command (except the ones with only
356 * read-cache invalidate bits set) must have the CS_STALL bit set. We
357 * don't try to be clever and just set it unconditionally.
358 */
359 flags |= PIPE_CONTROL_CS_STALL;
360
361 /* Just flush everything. Experiments have shown that reducing the
362 * number of bits based on the write domains has little performance
363 * impact.
364 */
365 if (mode & EMIT_FLUSH) {
366 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
367 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
368 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
369 flags |= PIPE_CONTROL_FLUSH_ENABLE;
370 }
371 if (mode & EMIT_INVALIDATE) {
372 flags |= PIPE_CONTROL_TLB_INVALIDATE;
373 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
374 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
375 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
376 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
377 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
378 flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
379 /*
380 * TLB invalidate requires a post-sync write.
381 */
382 flags |= PIPE_CONTROL_QW_WRITE;
383 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
384
385 flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
386
387 /* Workaround: we must issue a pipe_control with CS-stall bit
388 * set before a pipe_control command that has the state cache
389 * invalidate bit set. */
390 gen7_render_ring_cs_stall_wa(rq);
391 }
392
393 cs = intel_ring_begin(rq, 4);
394 if (IS_ERR(cs))
395 return PTR_ERR(cs);
396
397 *cs++ = GFX_OP_PIPE_CONTROL(4);
398 *cs++ = flags;
399 *cs++ = scratch_addr;
400 *cs++ = 0;
401 intel_ring_advance(rq, cs);
402
403 return 0;
404}
405
406static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
407{
408 *cs++ = GFX_OP_PIPE_CONTROL(4);
409 *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
410 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
411 PIPE_CONTROL_DC_FLUSH_ENABLE |
412 PIPE_CONTROL_FLUSH_ENABLE |
413 PIPE_CONTROL_QW_WRITE |
414 PIPE_CONTROL_GLOBAL_GTT_IVB |
415 PIPE_CONTROL_CS_STALL);
416 *cs++ = rq->timeline->hwsp_offset;
417 *cs++ = rq->fence.seqno;
418
419 *cs++ = GFX_OP_PIPE_CONTROL(4);
420 *cs++ = (PIPE_CONTROL_QW_WRITE |
421 PIPE_CONTROL_STORE_DATA_INDEX |
422 PIPE_CONTROL_GLOBAL_GTT_IVB);
423 *cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
424 *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
425
426 *cs++ = MI_USER_INTERRUPT;
427 *cs++ = MI_NOOP;
428
429 rq->tail = intel_ring_offset(rq, cs);
430 assert_ring_tail_valid(rq->ring, rq->tail);
431
432 return cs;
433}
434
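/*
 * gen6 breadcrumb for the non-render engines: MI_FLUSH_DW stores the request
 * seqno and the hangcheck seqno into the status page, followed by a user
 * interrupt.
 */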
435static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
436{
437 GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
438 GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
439
440 *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
441 *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
442 *cs++ = rq->fence.seqno;
443
444 *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
445 *cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
446 *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
447
448 *cs++ = MI_USER_INTERRUPT;
449 *cs++ = MI_NOOP;
450
451 rq->tail = intel_ring_offset(rq, cs);
452 assert_ring_tail_valid(rq->ring, rq->tail);
453
454 return cs;
455}
456
457#define GEN7_XCS_WA 32
458static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
459{
460 int i;
461
462 GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
463 GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
464
465 *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
466 *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
467 *cs++ = rq->fence.seqno;
468
469 *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
470 *cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
471 *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
472
473 for (i = 0; i < GEN7_XCS_WA; i++) {
474 *cs++ = MI_STORE_DWORD_INDEX;
475 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
476 *cs++ = rq->fence.seqno;
477 }
478
479 *cs++ = MI_FLUSH_DW;
480 *cs++ = 0;
481 *cs++ = 0;
482
483 *cs++ = MI_USER_INTERRUPT;
484
485 rq->tail = intel_ring_offset(rq, cs);
486 assert_ring_tail_valid(rq->ring, rq->tail);
487
488 return cs;
489}
490#undef GEN7_XCS_WA
491
492static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
493{
494 /*
495 * Keep the render interrupt unmasked as this papers over
496 * lost interrupts following a reset.
497 */
498 if (engine->class == RENDER_CLASS) {
499 if (INTEL_GEN(engine->i915) >= 6)
500 mask &= ~BIT(0);
501 else
502 mask &= ~I915_USER_INTERRUPT;
503 }
504
505 intel_engine_set_hwsp_writemask(engine, mask);
506}
507
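/*
 * Program the physical address of the hardware status page into HWS_PGA.
 * On gen4+ the code folds bits [35:32] of the address into bits [7:4] of
 * the value written.
 */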
508static void set_hws_pga(struct intel_engine_cs *engine, phys_addr_t phys)
509{
510 struct drm_i915_private *dev_priv = engine->i915;
511 u32 addr;
512
513 addr = lower_32_bits(phys);
514 if (INTEL_GEN(dev_priv) >= 4)
515 addr |= (phys >> 28) & 0xf0;
516
517 I915_WRITE(HWS_PGA, addr);
518}
519
520static struct page *status_page(struct intel_engine_cs *engine)
521{
522 struct drm_i915_gem_object *obj = engine->status_page.vma->obj;
523
524 GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
525 return sg_page(obj->mm.pages->sgl);
526}
527
528static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
529{
530 set_hws_pga(engine, PFN_PHYS(page_to_pfn(status_page(engine))));
531 set_hwstam(engine, ~0u);
532}
533
534static void set_hwsp(struct intel_engine_cs *engine, u32 offset)
535{
536 struct drm_i915_private *dev_priv = engine->i915;
537 i915_reg_t hwsp;
538
539 /*
540 * The ring status page addresses are no longer next to the rest of
541 * the ring registers as of gen7.
542 */
543 if (IS_GEN(dev_priv, 7)) {
544 switch (engine->id) {
545 /*
		 546		 * No other rings exist on Gen7. The default case is only to shut up
		 547		 * the gcc switch check warning.
548 */
549 default:
550 GEM_BUG_ON(engine->id);
551 /* fallthrough */
552 case RCS0:
553 hwsp = RENDER_HWS_PGA_GEN7;
554 break;
555 case BCS0:
556 hwsp = BLT_HWS_PGA_GEN7;
557 break;
558 case VCS0:
559 hwsp = BSD_HWS_PGA_GEN7;
560 break;
561 case VECS0:
562 hwsp = VEBOX_HWS_PGA_GEN7;
563 break;
564 }
565 } else if (IS_GEN(dev_priv, 6)) {
566 hwsp = RING_HWS_PGA_GEN6(engine->mmio_base);
567 } else {
568 hwsp = RING_HWS_PGA(engine->mmio_base);
569 }
570
571 I915_WRITE(hwsp, offset);
572 POSTING_READ(hwsp);
573}
574
575static void flush_cs_tlb(struct intel_engine_cs *engine)
576{
577 struct drm_i915_private *dev_priv = engine->i915;
578
579 if (!IS_GEN_RANGE(dev_priv, 6, 7))
580 return;
581
 582	/* ring should be idle before issuing a sync flush */
583 WARN_ON((ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
584
585 ENGINE_WRITE(engine, RING_INSTPM,
586 _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
587 INSTPM_SYNC_FLUSH));
588 if (intel_wait_for_register(engine->uncore,
589 RING_INSTPM(engine->mmio_base),
590 INSTPM_SYNC_FLUSH, 0,
591 1000))
592 DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
593 engine->name);
594}
595
596static void ring_setup_status_page(struct intel_engine_cs *engine)
597{
598 set_hwsp(engine, i915_ggtt_offset(engine->status_page.vma));
599 set_hwstam(engine, ~0u);
600
601 flush_cs_tlb(engine);
602}
603
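/*
 * Request the engine to stop (STOP_RING), wait for it to report idle and
 * then clear HEAD/TAIL/CTL. Returns true if the ring head ended up at zero.
 */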
604static bool stop_ring(struct intel_engine_cs *engine)
605{
606 struct drm_i915_private *dev_priv = engine->i915;
607
608 if (INTEL_GEN(dev_priv) > 2) {
609 ENGINE_WRITE(engine,
610 RING_MI_MODE, _MASKED_BIT_ENABLE(STOP_RING));
611 if (intel_wait_for_register(engine->uncore,
612 RING_MI_MODE(engine->mmio_base),
613 MODE_IDLE,
614 MODE_IDLE,
615 1000)) {
616 DRM_ERROR("%s : timed out trying to stop ring\n",
617 engine->name);
618
619 /*
620 * Sometimes we observe that the idle flag is not
621 * set even though the ring is empty. So double
622 * check before giving up.
623 */
624 if (ENGINE_READ(engine, RING_HEAD) !=
625 ENGINE_READ(engine, RING_TAIL))
626 return false;
627 }
628 }
629
630 ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL));
631
632 ENGINE_WRITE(engine, RING_HEAD, 0);
633 ENGINE_WRITE(engine, RING_TAIL, 0);
634
635 /* The ring must be empty before it is disabled */
636 ENGINE_WRITE(engine, RING_CTL, 0);
637
638 return (ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) == 0;
639}
640
641static int init_ring_common(struct intel_engine_cs *engine)
642{
643 struct drm_i915_private *dev_priv = engine->i915;
644 struct intel_ring *ring = engine->buffer;
645 int ret = 0;
646
647 intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
648
649 if (!stop_ring(engine)) {
650 /* G45 ring initialization often fails to reset head to zero */
651 DRM_DEBUG_DRIVER("%s head not reset to zero "
652 "ctl %08x head %08x tail %08x start %08x\n",
653 engine->name,
654 ENGINE_READ(engine, RING_CTL),
655 ENGINE_READ(engine, RING_HEAD),
656 ENGINE_READ(engine, RING_TAIL),
657 ENGINE_READ(engine, RING_START));
658
659 if (!stop_ring(engine)) {
660 DRM_ERROR("failed to set %s head to zero "
661 "ctl %08x head %08x tail %08x start %08x\n",
662 engine->name,
663 ENGINE_READ(engine, RING_CTL),
664 ENGINE_READ(engine, RING_HEAD),
665 ENGINE_READ(engine, RING_TAIL),
666 ENGINE_READ(engine, RING_START));
667 ret = -EIO;
668 goto out;
669 }
670 }
671
672 if (HWS_NEEDS_PHYSICAL(dev_priv))
673 ring_setup_phys_status_page(engine);
674 else
675 ring_setup_status_page(engine);
676
677 intel_engine_reset_breadcrumbs(engine);
678
679 /* Enforce ordering by reading HEAD register back */
680 ENGINE_READ(engine, RING_HEAD);
681
682 /* Initialize the ring. This must happen _after_ we've cleared the ring
683 * registers with the above sequence (the readback of the HEAD registers
684 * also enforces ordering), otherwise the hw might lose the new ring
685 * register values. */
686 ENGINE_WRITE(engine, RING_START, i915_ggtt_offset(ring->vma));
687
688 /* WaClearRingBufHeadRegAtInit:ctg,elk */
689 if (ENGINE_READ(engine, RING_HEAD))
690 DRM_DEBUG_DRIVER("%s initialization failed [head=%08x], fudging\n",
691 engine->name, ENGINE_READ(engine, RING_HEAD));
692
693 /* Check that the ring offsets point within the ring! */
694 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
695 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
696 intel_ring_update_space(ring);
697
698 /* First wake the ring up to an empty/idle ring */
699 ENGINE_WRITE(engine, RING_HEAD, ring->head);
700 ENGINE_WRITE(engine, RING_TAIL, ring->head);
701 ENGINE_POSTING_READ(engine, RING_TAIL);
702
703 ENGINE_WRITE(engine, RING_CTL, RING_CTL_SIZE(ring->size) | RING_VALID);
704
705 /* If the head is still not zero, the ring is dead */
706 if (intel_wait_for_register(engine->uncore,
707 RING_CTL(engine->mmio_base),
708 RING_VALID, RING_VALID,
709 50)) {
710 DRM_ERROR("%s initialization failed "
711 "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
712 engine->name,
713 ENGINE_READ(engine, RING_CTL),
714 ENGINE_READ(engine, RING_CTL) & RING_VALID,
715 ENGINE_READ(engine, RING_HEAD), ring->head,
716 ENGINE_READ(engine, RING_TAIL), ring->tail,
717 ENGINE_READ(engine, RING_START),
718 i915_ggtt_offset(ring->vma));
719 ret = -EIO;
720 goto out;
721 }
722
723 if (INTEL_GEN(dev_priv) > 2)
724 ENGINE_WRITE(engine,
725 RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
726
727 /* Now awake, let it get started */
728 if (ring->tail != ring->head) {
729 ENGINE_WRITE(engine, RING_TAIL, ring->tail);
730 ENGINE_POSTING_READ(engine, RING_TAIL);
731 }
732
733 /* Papering over lost _interrupts_ immediately following the restart */
734 intel_engine_queue_breadcrumbs(engine);
735out:
736 intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
737
738 return ret;
739}
740
741static void reset_prepare(struct intel_engine_cs *engine)
742{
743 intel_engine_stop_cs(engine);
744}
745
746static void reset_ring(struct intel_engine_cs *engine, bool stalled)
747{
748 struct i915_timeline *tl = &engine->timeline;
749 struct i915_request *pos, *rq;
750 unsigned long flags;
751 u32 head;
752
753 rq = NULL;
754 spin_lock_irqsave(&tl->lock, flags);
755 list_for_each_entry(pos, &tl->requests, link) {
756 if (!i915_request_completed(pos)) {
757 rq = pos;
758 break;
759 }
760 }
761
762 /*
763 * The guilty request will get skipped on a hung engine.
764 *
765 * Users of client default contexts do not rely on logical
766 * state preserved between batches so it is safe to execute
767 * queued requests following the hang. Non default contexts
768 * rely on preserved state, so skipping a batch loses the
769 * evolution of the state and it needs to be considered corrupted.
770 * Executing more queued batches on top of corrupted state is
771 * risky. But we take the risk by trying to advance through
772 * the queued requests in order to make the client behaviour
 773	 * more predictable around resets, by not throwing away a random
 774	 * number of batches it has prepared for execution. Sophisticated
775 * clients can use gem_reset_stats_ioctl and dma fence status
776 * (exported via sync_file info ioctl on explicit fences) to observe
 777	 * when they lose the context state and should rebuild accordingly.
778 *
779 * The context ban, and ultimately the client ban, mechanism are safety
780 * valves if client submission ends up resulting in nothing more than
781 * subsequent hangs.
782 */
783
784 if (rq) {
785 /*
786 * Try to restore the logical GPU state to match the
787 * continuation of the request queue. If we skip the
788 * context/PD restore, then the next request may try to execute
789 * assuming that its context is valid and loaded on the GPU and
790 * so may try to access invalid memory, prompting repeated GPU
791 * hangs.
792 *
793 * If the request was guilty, we still restore the logical
794 * state in case the next request requires it (e.g. the
795 * aliasing ppgtt), but skip over the hung batch.
796 *
797 * If the request was innocent, we try to replay the request
798 * with the restored context.
799 */
800 i915_reset_request(rq, stalled);
801
802 GEM_BUG_ON(rq->ring != engine->buffer);
803 head = rq->head;
804 } else {
805 head = engine->buffer->tail;
806 }
807 engine->buffer->head = intel_ring_wrap(engine->buffer, head);
808
809 spin_unlock_irqrestore(&tl->lock, flags);
810}
811
812static void reset_finish(struct intel_engine_cs *engine)
813{
814}
815
816static int intel_rcs_ctx_init(struct i915_request *rq)
817{
818 int ret;
819
820 ret = intel_engine_emit_ctx_wa(rq);
821 if (ret != 0)
822 return ret;
823
824 ret = i915_gem_render_state_emit(rq);
825 if (ret)
826 return ret;
827
828 return 0;
829}
830
831static int init_render_ring(struct intel_engine_cs *engine)
832{
833 struct drm_i915_private *dev_priv = engine->i915;
834 int ret = init_ring_common(engine);
835 if (ret)
836 return ret;
837
838 /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
839 if (IS_GEN_RANGE(dev_priv, 4, 6))
840 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
841
842 /* We need to disable the AsyncFlip performance optimisations in order
843 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
844 * programmed to '1' on all products.
845 *
846 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
847 */
848 if (IS_GEN_RANGE(dev_priv, 6, 7))
849 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
850
851 /* Required for the hardware to program scanline values for waiting */
852 /* WaEnableFlushTlbInvalidationMode:snb */
853 if (IS_GEN(dev_priv, 6))
854 I915_WRITE(GFX_MODE,
855 _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));
856
857 /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
858 if (IS_GEN(dev_priv, 7))
859 I915_WRITE(GFX_MODE_GEN7,
860 _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
861 _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
862
863 if (IS_GEN(dev_priv, 6)) {
864 /* From the Sandybridge PRM, volume 1 part 3, page 24:
865 * "If this bit is set, STCunit will have LRA as replacement
866 * policy. [...] This bit must be reset. LRA replacement
867 * policy is not supported."
868 */
869 I915_WRITE(CACHE_MODE_0,
870 _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
871 }
872
873 if (IS_GEN_RANGE(dev_priv, 6, 7))
874 I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
875
876 if (INTEL_GEN(dev_priv) >= 6)
877 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
878
879 return 0;
880}
881
882static void cancel_requests(struct intel_engine_cs *engine)
883{
884 struct i915_request *request;
885 unsigned long flags;
886
887 spin_lock_irqsave(&engine->timeline.lock, flags);
888
889 /* Mark all submitted requests as skipped. */
890 list_for_each_entry(request, &engine->timeline.requests, link) {
891 if (!i915_request_signaled(request))
892 dma_fence_set_error(&request->fence, -EIO);
893
894 i915_request_mark_complete(request);
895 }
896
897 /* Remaining _unready_ requests will be nop'ed when submitted */
898
899 spin_unlock_irqrestore(&engine->timeline.lock, flags);
900}
901
902static void i9xx_submit_request(struct i915_request *request)
903{
904 i915_request_submit(request);
905
906 ENGINE_WRITE(request->engine, RING_TAIL,
907 intel_ring_set_tail(request->ring, request->tail));
908}
909
910static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
911{
912 GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
913 GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
914
915 *cs++ = MI_FLUSH;
916
917 *cs++ = MI_STORE_DWORD_INDEX;
918 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
919 *cs++ = rq->fence.seqno;
920
921 *cs++ = MI_STORE_DWORD_INDEX;
922 *cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
923 *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
924
925 *cs++ = MI_USER_INTERRUPT;
926
927 rq->tail = intel_ring_offset(rq, cs);
928 assert_ring_tail_valid(rq->ring, rq->tail);
929
930 return cs;
931}
932
933#define GEN5_WA_STORES 8 /* must be at least 1! */
934static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
935{
936 int i;
937
938 GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
939 GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
940
941 *cs++ = MI_FLUSH;
942
943 *cs++ = MI_STORE_DWORD_INDEX;
944 *cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
945 *cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
946
947 BUILD_BUG_ON(GEN5_WA_STORES < 1);
948 for (i = 0; i < GEN5_WA_STORES; i++) {
949 *cs++ = MI_STORE_DWORD_INDEX;
950 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
951 *cs++ = rq->fence.seqno;
952 }
953
954 *cs++ = MI_USER_INTERRUPT;
955 *cs++ = MI_NOOP;
956
957 rq->tail = intel_ring_offset(rq, cs);
958 assert_ring_tail_valid(rq->ring, rq->tail);
959
960 return cs;
961}
962#undef GEN5_WA_STORES
963
964static void
965gen5_irq_enable(struct intel_engine_cs *engine)
966{
967 gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
968}
969
970static void
971gen5_irq_disable(struct intel_engine_cs *engine)
972{
973 gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
974}
975
976static void
977i9xx_irq_enable(struct intel_engine_cs *engine)
978{
979 engine->i915->irq_mask &= ~engine->irq_enable_mask;
980 intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
981 intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
982}
983
984static void
985i9xx_irq_disable(struct intel_engine_cs *engine)
986{
987 engine->i915->irq_mask |= engine->irq_enable_mask;
988 intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
989}
990
991static void
992i8xx_irq_enable(struct intel_engine_cs *engine)
993{
994 struct drm_i915_private *dev_priv = engine->i915;
995
996 dev_priv->irq_mask &= ~engine->irq_enable_mask;
997 I915_WRITE16(GEN2_IMR, dev_priv->irq_mask);
998 POSTING_READ16(RING_IMR(engine->mmio_base));
999}
1000
1001static void
1002i8xx_irq_disable(struct intel_engine_cs *engine)
1003{
1004 struct drm_i915_private *dev_priv = engine->i915;
1005
1006 dev_priv->irq_mask |= engine->irq_enable_mask;
1007 I915_WRITE16(GEN2_IMR, dev_priv->irq_mask);
1008}
1009
1010static int
1011bsd_ring_flush(struct i915_request *rq, u32 mode)
1012{
1013 u32 *cs;
1014
1015 cs = intel_ring_begin(rq, 2);
1016 if (IS_ERR(cs))
1017 return PTR_ERR(cs);
1018
1019 *cs++ = MI_FLUSH;
1020 *cs++ = MI_NOOP;
1021 intel_ring_advance(rq, cs);
1022 return 0;
1023}
1024
1025static void
1026gen6_irq_enable(struct intel_engine_cs *engine)
1027{
1028 ENGINE_WRITE(engine, RING_IMR,
1029 ~(engine->irq_enable_mask | engine->irq_keep_mask));
1030
1031 /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1032 ENGINE_POSTING_READ(engine, RING_IMR);
1033
1034 gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
1035}
1036
1037static void
1038gen6_irq_disable(struct intel_engine_cs *engine)
1039{
1040 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
1041 gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
1042}
1043
1044static void
1045hsw_vebox_irq_enable(struct intel_engine_cs *engine)
1046{
1047 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);
1048
1049 /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1050 ENGINE_POSTING_READ(engine, RING_IMR);
1051
1052 gen6_unmask_pm_irq(engine->i915, engine->irq_enable_mask);
1053}
1054
1055static void
1056hsw_vebox_irq_disable(struct intel_engine_cs *engine)
1057{
1058 ENGINE_WRITE(engine, RING_IMR, ~0);
1059 gen6_mask_pm_irq(engine->i915, engine->irq_enable_mask);
1060}
1061
1062static int
1063i965_emit_bb_start(struct i915_request *rq,
1064 u64 offset, u32 length,
1065 unsigned int dispatch_flags)
1066{
1067 u32 *cs;
1068
1069 cs = intel_ring_begin(rq, 2);
1070 if (IS_ERR(cs))
1071 return PTR_ERR(cs);
1072
1073 *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | (dispatch_flags &
1074 I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE_I965);
1075 *cs++ = offset;
1076 intel_ring_advance(rq, cs);
1077
1078 return 0;
1079}
1080
1081/* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
1082#define I830_BATCH_LIMIT SZ_256K
1083#define I830_TLB_ENTRIES (2)
1084#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
1085static int
1086i830_emit_bb_start(struct i915_request *rq,
1087 u64 offset, u32 len,
1088 unsigned int dispatch_flags)
1089{
1090 u32 *cs, cs_offset = i915_scratch_offset(rq->i915);
1091
1092 GEM_BUG_ON(rq->i915->gt.scratch->size < I830_WA_SIZE);
1093
1094 cs = intel_ring_begin(rq, 6);
1095 if (IS_ERR(cs))
1096 return PTR_ERR(cs);
1097
1098 /* Evict the invalid PTE TLBs */
1099 *cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
1100 *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
1101 *cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
1102 *cs++ = cs_offset;
1103 *cs++ = 0xdeadbeef;
1104 *cs++ = MI_NOOP;
1105 intel_ring_advance(rq, cs);
1106
1107 if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
1108 if (len > I830_BATCH_LIMIT)
1109 return -ENOSPC;
1110
1111 cs = intel_ring_begin(rq, 6 + 2);
1112 if (IS_ERR(cs))
1113 return PTR_ERR(cs);
1114
 1115		/* Blit the batch (which now has all relocs applied) to the
1116 * stable batch scratch bo area (so that the CS never
1117 * stumbles over its tlb invalidation bug) ...
1118 */
1119 *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA;
1120 *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
1121 *cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
1122 *cs++ = cs_offset;
1123 *cs++ = 4096;
1124 *cs++ = offset;
1125
1126 *cs++ = MI_FLUSH;
1127 *cs++ = MI_NOOP;
1128 intel_ring_advance(rq, cs);
1129
1130 /* ... and execute it. */
1131 offset = cs_offset;
1132 }
1133
1134 cs = intel_ring_begin(rq, 2);
1135 if (IS_ERR(cs))
1136 return PTR_ERR(cs);
1137
1138 *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1139 *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1140 MI_BATCH_NON_SECURE);
1141 intel_ring_advance(rq, cs);
1142
1143 return 0;
1144}
1145
1146static int
1147i915_emit_bb_start(struct i915_request *rq,
1148 u64 offset, u32 len,
1149 unsigned int dispatch_flags)
1150{
1151 u32 *cs;
1152
1153 cs = intel_ring_begin(rq, 2);
1154 if (IS_ERR(cs))
1155 return PTR_ERR(cs);
1156
1157 *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1158 *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1159 MI_BATCH_NON_SECURE);
1160 intel_ring_advance(rq, cs);
1161
1162 return 0;
1163}
1164
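/*
 * Pin the ring buffer into the GGTT (above the bias, to dodge the offset-0
 * wraparound hang noted below) and map it for CPU access, either through
 * the mappable aperture or with a regular kernel mapping.
 */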
1165int intel_ring_pin(struct intel_ring *ring)
1166{
1167 struct i915_vma *vma = ring->vma;
1168 enum i915_map_type map = i915_coherent_map_type(vma->vm->i915);
1169 unsigned int flags;
1170 void *addr;
1171 int ret;
1172
1173 GEM_BUG_ON(ring->vaddr);
1174
1175 ret = i915_timeline_pin(ring->timeline);
1176 if (ret)
1177 return ret;
1178
1179 flags = PIN_GLOBAL;
1180
1181 /* Ring wraparound at offset 0 sometimes hangs. No idea why. */
1182 flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);
1183
1184 if (vma->obj->stolen)
1185 flags |= PIN_MAPPABLE;
1186 else
1187 flags |= PIN_HIGH;
1188
1189 ret = i915_vma_pin(vma, 0, 0, flags);
1190 if (unlikely(ret))
1191 goto unpin_timeline;
1192
1193 if (i915_vma_is_map_and_fenceable(vma))
1194 addr = (void __force *)i915_vma_pin_iomap(vma);
1195 else
1196 addr = i915_gem_object_pin_map(vma->obj, map);
1197 if (IS_ERR(addr)) {
1198 ret = PTR_ERR(addr);
1199 goto unpin_ring;
1200 }
1201
1202 vma->obj->pin_global++;
1203
1204 ring->vaddr = addr;
1205 return 0;
1206
1207unpin_ring:
1208 i915_vma_unpin(vma);
1209unpin_timeline:
1210 i915_timeline_unpin(ring->timeline);
1211 return ret;
1212}
1213
1214void intel_ring_reset(struct intel_ring *ring, u32 tail)
1215{
1216 GEM_BUG_ON(!intel_ring_offset_valid(ring, tail));
1217
1218 ring->tail = tail;
1219 ring->head = tail;
1220 ring->emit = tail;
1221 intel_ring_update_space(ring);
1222}
1223
1224void intel_ring_unpin(struct intel_ring *ring)
1225{
1226 GEM_BUG_ON(!ring->vma);
1227 GEM_BUG_ON(!ring->vaddr);
1228
1229 /* Discard any unused bytes beyond that submitted to hw. */
1230 intel_ring_reset(ring, ring->tail);
1231
1232 if (i915_vma_is_map_and_fenceable(ring->vma))
1233 i915_vma_unpin_iomap(ring->vma);
1234 else
1235 i915_gem_object_unpin_map(ring->vma->obj);
1236 ring->vaddr = NULL;
1237
1238 ring->vma->obj->pin_global--;
1239 i915_vma_unpin(ring->vma);
1240
1241 i915_timeline_unpin(ring->timeline);
1242}
1243
1244static struct i915_vma *
1245intel_ring_create_vma(struct drm_i915_private *dev_priv, int size)
1246{
1247 struct i915_address_space *vm = &dev_priv->ggtt.vm;
1248 struct drm_i915_gem_object *obj;
1249 struct i915_vma *vma;
1250
1251 obj = i915_gem_object_create_stolen(dev_priv, size);
1252 if (!obj)
1253 obj = i915_gem_object_create_internal(dev_priv, size);
1254 if (IS_ERR(obj))
1255 return ERR_CAST(obj);
1256
1257 /*
1258 * Mark ring buffers as read-only from GPU side (so no stray overwrites)
1259 * if supported by the platform's GGTT.
1260 */
1261 if (vm->has_read_only)
1262 i915_gem_object_set_readonly(obj);
1263
1264 vma = i915_vma_instance(obj, vm, NULL);
1265 if (IS_ERR(vma))
1266 goto err;
1267
1268 return vma;
1269
1270err:
1271 i915_gem_object_put(obj);
1272 return vma;
1273}
1274
1275struct intel_ring *
1276intel_engine_create_ring(struct intel_engine_cs *engine,
1277 struct i915_timeline *timeline,
1278 int size)
1279{
1280 struct intel_ring *ring;
1281 struct i915_vma *vma;
1282
1283 GEM_BUG_ON(!is_power_of_2(size));
1284 GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES);
1285 GEM_BUG_ON(timeline == &engine->timeline);
1286 lockdep_assert_held(&engine->i915->drm.struct_mutex);
1287
1288 ring = kzalloc(sizeof(*ring), GFP_KERNEL);
1289 if (!ring)
1290 return ERR_PTR(-ENOMEM);
1291
1292 kref_init(&ring->ref);
1293 INIT_LIST_HEAD(&ring->request_list);
1294 ring->timeline = i915_timeline_get(timeline);
1295
1296 ring->size = size;
1297 /* Workaround an erratum on the i830 which causes a hang if
1298 * the TAIL pointer points to within the last 2 cachelines
1299 * of the buffer.
1300 */
1301 ring->effective_size = size;
1302 if (IS_I830(engine->i915) || IS_I845G(engine->i915))
1303 ring->effective_size -= 2 * CACHELINE_BYTES;
1304
1305 intel_ring_update_space(ring);
1306
1307 vma = intel_ring_create_vma(engine->i915, size);
1308 if (IS_ERR(vma)) {
1309 kfree(ring);
1310 return ERR_CAST(vma);
1311 }
1312 ring->vma = vma;
1313
1314 return ring;
1315}
1316
1317void intel_ring_free(struct kref *ref)
1318{
1319 struct intel_ring *ring = container_of(ref, typeof(*ring), ref);
1320 struct drm_i915_gem_object *obj = ring->vma->obj;
1321
1322 i915_vma_close(ring->vma);
1323 __i915_gem_object_release_unless_active(obj);
1324
1325 i915_timeline_put(ring->timeline);
1326 kfree(ring);
1327}
1328
1329static void __ring_context_fini(struct intel_context *ce)
1330{
1331 GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
1332 i915_gem_object_put(ce->state->obj);
1333}
1334
1335static void ring_context_destroy(struct kref *ref)
1336{
1337 struct intel_context *ce = container_of(ref, typeof(*ce), ref);
1338
1339 GEM_BUG_ON(intel_context_is_pinned(ce));
1340
1341 if (ce->state)
1342 __ring_context_fini(ce);
1343
1344 intel_context_free(ce);
1345}
1346
1347static int __context_pin_ppgtt(struct i915_gem_context *ctx)
1348{
1349 struct i915_hw_ppgtt *ppgtt;
1350 int err = 0;
1351
1352 ppgtt = ctx->ppgtt ?: ctx->i915->mm.aliasing_ppgtt;
1353 if (ppgtt)
1354 err = gen6_ppgtt_pin(ppgtt);
1355
1356 return err;
1357}
1358
1359static void __context_unpin_ppgtt(struct i915_gem_context *ctx)
1360{
1361 struct i915_hw_ppgtt *ppgtt;
1362
1363 ppgtt = ctx->ppgtt ?: ctx->i915->mm.aliasing_ppgtt;
1364 if (ppgtt)
1365 gen6_ppgtt_unpin(ppgtt);
1366}
1367
1368static int __context_pin(struct intel_context *ce)
1369{
1370 struct i915_vma *vma;
1371 int err;
1372
1373 vma = ce->state;
1374 if (!vma)
1375 return 0;
1376
1377 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
1378 if (err)
1379 return err;
1380
1381 /*
 1382	 * And mark it as a globally pinned object to let the shrinker know
1383 * it cannot reclaim the object until we release it.
1384 */
1385 vma->obj->pin_global++;
1386 vma->obj->mm.dirty = true;
1387
1388 return 0;
1389}
1390
1391static void __context_unpin(struct intel_context *ce)
1392{
1393 struct i915_vma *vma;
1394
1395 vma = ce->state;
1396 if (!vma)
1397 return;
1398
1399 vma->obj->pin_global--;
1400 i915_vma_unpin(vma);
1401}
1402
1403static void ring_context_unpin(struct intel_context *ce)
1404{
1405 __context_unpin_ppgtt(ce->gem_context);
1406 __context_unpin(ce);
1407}
1408
1409static struct i915_vma *
1410alloc_context_vma(struct intel_engine_cs *engine)
1411{
1412 struct drm_i915_private *i915 = engine->i915;
1413 struct drm_i915_gem_object *obj;
1414 struct i915_vma *vma;
1415 int err;
1416
1417 obj = i915_gem_object_create(i915, engine->context_size);
1418 if (IS_ERR(obj))
1419 return ERR_CAST(obj);
1420
1421 /*
1422 * Try to make the context utilize L3 as well as LLC.
1423 *
1424 * On VLV we don't have L3 controls in the PTEs so we
1425 * shouldn't touch the cache level, especially as that
1426 * would make the object snooped which might have a
1427 * negative performance impact.
1428 *
1429 * Snooping is required on non-llc platforms in execlist
1430 * mode, but since all GGTT accesses use PAT entry 0 we
1431 * get snooping anyway regardless of cache_level.
1432 *
1433 * This is only applicable for Ivy Bridge devices since
1434 * later platforms don't have L3 control bits in the PTE.
1435 */
1436 if (IS_IVYBRIDGE(i915))
1437 i915_gem_object_set_cache_coherency(obj, I915_CACHE_L3_LLC);
1438
1439 if (engine->default_state) {
1440 void *defaults, *vaddr;
1441
1442 vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
1443 if (IS_ERR(vaddr)) {
1444 err = PTR_ERR(vaddr);
1445 goto err_obj;
1446 }
1447
1448 defaults = i915_gem_object_pin_map(engine->default_state,
1449 I915_MAP_WB);
1450 if (IS_ERR(defaults)) {
1451 err = PTR_ERR(defaults);
1452 goto err_map;
1453 }
1454
1455 memcpy(vaddr, defaults, engine->context_size);
1456 i915_gem_object_unpin_map(engine->default_state);
1457
1458 i915_gem_object_flush_map(obj);
1459 i915_gem_object_unpin_map(obj);
1460 }
1461
1462 vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
1463 if (IS_ERR(vma)) {
1464 err = PTR_ERR(vma);
1465 goto err_obj;
1466 }
1467
1468 return vma;
1469
1470err_map:
1471 i915_gem_object_unpin_map(obj);
1472err_obj:
1473 i915_gem_object_put(obj);
1474 return ERR_PTR(err);
1475}
1476
1477static int ring_context_pin(struct intel_context *ce)
1478{
1479 struct intel_engine_cs *engine = ce->engine;
1480 int err;
1481
1482 /* One ringbuffer to rule them all */
1483 GEM_BUG_ON(!engine->buffer);
1484 ce->ring = engine->buffer;
1485
1486 if (!ce->state && engine->context_size) {
1487 struct i915_vma *vma;
1488
1489 vma = alloc_context_vma(engine);
1490 if (IS_ERR(vma))
1491 return PTR_ERR(vma);
1492
1493 ce->state = vma;
1494 }
1495
1496 err = __context_pin(ce);
1497 if (err)
1498 return err;
1499
1500 err = __context_pin_ppgtt(ce->gem_context);
1501 if (err)
1502 goto err_unpin;
1503
1504 return 0;
1505
1506err_unpin:
1507 __context_unpin(ce);
1508 return err;
1509}
1510
1511static void ring_context_reset(struct intel_context *ce)
1512{
1513 intel_ring_reset(ce->ring, 0);
1514}
1515
1516static const struct intel_context_ops ring_context_ops = {
1517 .pin = ring_context_pin,
1518 .unpin = ring_context_unpin,
1519
1520 .reset = ring_context_reset,
1521 .destroy = ring_context_destroy,
1522};
1523
1524static int intel_init_ring_buffer(struct intel_engine_cs *engine)
1525{
1526 struct i915_timeline *timeline;
1527 struct intel_ring *ring;
1528 int err;
1529
1530 err = intel_engine_setup_common(engine);
1531 if (err)
1532 return err;
1533
1534 timeline = i915_timeline_create(engine->i915, engine->status_page.vma);
1535 if (IS_ERR(timeline)) {
1536 err = PTR_ERR(timeline);
1537 goto err;
1538 }
1539 GEM_BUG_ON(timeline->has_initial_breadcrumb);
1540
1541 ring = intel_engine_create_ring(engine, timeline, 32 * PAGE_SIZE);
1542 i915_timeline_put(timeline);
1543 if (IS_ERR(ring)) {
1544 err = PTR_ERR(ring);
1545 goto err;
1546 }
1547
1548 err = intel_ring_pin(ring);
1549 if (err)
1550 goto err_ring;
1551
1552 GEM_BUG_ON(engine->buffer);
1553 engine->buffer = ring;
1554
1555 err = intel_engine_init_common(engine);
1556 if (err)
1557 goto err_unpin;
1558
1559 GEM_BUG_ON(ring->timeline->hwsp_ggtt != engine->status_page.vma);
1560
1561 return 0;
1562
1563err_unpin:
1564 intel_ring_unpin(ring);
1565err_ring:
1566 intel_ring_put(ring);
1567err:
1568 intel_engine_cleanup_common(engine);
1569 return err;
1570}
1571
1572void intel_engine_cleanup(struct intel_engine_cs *engine)
1573{
1574 struct drm_i915_private *dev_priv = engine->i915;
1575
1576 WARN_ON(INTEL_GEN(dev_priv) > 2 &&
1577 (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
1578
1579 intel_ring_unpin(engine->buffer);
1580 intel_ring_put(engine->buffer);
1581
1582 if (engine->cleanup)
1583 engine->cleanup(engine);
1584
1585 intel_engine_cleanup_common(engine);
1586
1587 dev_priv->engine[engine->id] = NULL;
1588 kfree(engine);
1589}
1590
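/*
 * Point the engine's PP_DIR_DCLV and PP_DIR_BASE registers at @ppgtt's page
 * directory, using MI_LOAD_REGISTER_IMM from the ring.
 */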
1591static int load_pd_dir(struct i915_request *rq,
1592 const struct i915_hw_ppgtt *ppgtt)
1593{
1594 const struct intel_engine_cs * const engine = rq->engine;
1595 u32 *cs;
1596
1597 cs = intel_ring_begin(rq, 6);
1598 if (IS_ERR(cs))
1599 return PTR_ERR(cs);
1600
1601 *cs++ = MI_LOAD_REGISTER_IMM(1);
1602 *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
1603 *cs++ = PP_DIR_DCLV_2G;
1604
1605 *cs++ = MI_LOAD_REGISTER_IMM(1);
1606 *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1607 *cs++ = ppgtt->pd.base.ggtt_offset << 10;
1608
1609 intel_ring_advance(rq, cs);
1610
1611 return 0;
1612}
1613
1614static int flush_pd_dir(struct i915_request *rq)
1615{
1616 const struct intel_engine_cs * const engine = rq->engine;
1617 u32 *cs;
1618
1619 cs = intel_ring_begin(rq, 4);
1620 if (IS_ERR(cs))
1621 return PTR_ERR(cs);
1622
1623 /* Stall until the page table load is complete */
1624 *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1625 *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1626 *cs++ = i915_scratch_offset(rq->i915);
1627 *cs++ = MI_NOOP;
1628
1629 intel_ring_advance(rq, cs);
1630 return 0;
1631}
1632
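/*
 * Emit MI_SET_CONTEXT to switch the CS onto rq's logical context, wrapped in
 * the gen7 MI_ARB/PSMI workarounds and, when a forced restore is requested,
 * preceded by a dummy switch to the kernel context.
 */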
1633static inline int mi_set_context(struct i915_request *rq, u32 flags)
1634{
1635 struct drm_i915_private *i915 = rq->i915;
1636 struct intel_engine_cs *engine = rq->engine;
1637 enum intel_engine_id id;
1638 const int num_engines =
1639 IS_HSW_GT1(i915) ? RUNTIME_INFO(i915)->num_engines - 1 : 0;
1640 bool force_restore = false;
1641 int len;
1642 u32 *cs;
1643
1644 flags |= MI_MM_SPACE_GTT;
1645 if (IS_HASWELL(i915))
1646 /* These flags are for resource streamer on HSW+ */
1647 flags |= HSW_MI_RS_SAVE_STATE_EN | HSW_MI_RS_RESTORE_STATE_EN;
1648 else
1649 flags |= MI_SAVE_EXT_STATE_EN | MI_RESTORE_EXT_STATE_EN;
1650
1651 len = 4;
1652 if (IS_GEN(i915, 7))
1653 len += 2 + (num_engines ? 4 * num_engines + 6 : 0);
1654 if (flags & MI_FORCE_RESTORE) {
1655 GEM_BUG_ON(flags & MI_RESTORE_INHIBIT);
1656 flags &= ~MI_FORCE_RESTORE;
1657 force_restore = true;
1658 len += 2;
1659 }
1660
1661 cs = intel_ring_begin(rq, len);
1662 if (IS_ERR(cs))
1663 return PTR_ERR(cs);
1664
1665 /* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
1666 if (IS_GEN(i915, 7)) {
1667 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1668 if (num_engines) {
1669 struct intel_engine_cs *signaller;
1670
1671 *cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1672 for_each_engine(signaller, i915, id) {
1673 if (signaller == engine)
1674 continue;
1675
1676 *cs++ = i915_mmio_reg_offset(
1677 RING_PSMI_CTL(signaller->mmio_base));
1678 *cs++ = _MASKED_BIT_ENABLE(
1679 GEN6_PSMI_SLEEP_MSG_DISABLE);
1680 }
1681 }
1682 }
1683
1684 if (force_restore) {
1685 /*
1686 * The HW doesn't handle being told to restore the current
 1687		 * context very well. Quite often it likes to go off and
1688 * sulk, especially when it is meant to be reloading PP_DIR.
1689 * A very simple fix to force the reload is to simply switch
1690 * away from the current context and back again.
1691 *
1692 * Note that the kernel_context will contain random state
1693 * following the INHIBIT_RESTORE. We accept this since we
1694 * never use the kernel_context state; it is merely a
1695 * placeholder we use to flush other contexts.
1696 */
1697 *cs++ = MI_SET_CONTEXT;
1698 *cs++ = i915_ggtt_offset(engine->kernel_context->state) |
1699 MI_MM_SPACE_GTT |
1700 MI_RESTORE_INHIBIT;
1701 }
1702
1703 *cs++ = MI_NOOP;
1704 *cs++ = MI_SET_CONTEXT;
1705 *cs++ = i915_ggtt_offset(rq->hw_context->state) | flags;
1706 /*
1707 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
1708 * WaMiSetContext_Hang:snb,ivb,vlv
1709 */
1710 *cs++ = MI_NOOP;
1711
1712 if (IS_GEN(i915, 7)) {
1713 if (num_engines) {
1714 struct intel_engine_cs *signaller;
1715 i915_reg_t last_reg = {}; /* keep gcc quiet */
1716
1717 *cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1718 for_each_engine(signaller, i915, id) {
1719 if (signaller == engine)
1720 continue;
1721
1722 last_reg = RING_PSMI_CTL(signaller->mmio_base);
1723 *cs++ = i915_mmio_reg_offset(last_reg);
1724 *cs++ = _MASKED_BIT_DISABLE(
1725 GEN6_PSMI_SLEEP_MSG_DISABLE);
1726 }
1727
1728 /* Insert a delay before the next switch! */
1729 *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1730 *cs++ = i915_mmio_reg_offset(last_reg);
1731 *cs++ = i915_scratch_offset(rq->i915);
1732 *cs++ = MI_NOOP;
1733 }
1734 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1735 }
1736
1737 intel_ring_advance(rq, cs);
1738
1739 return 0;
1740}
1741
1742static int remap_l3(struct i915_request *rq, int slice)
1743{
1744 u32 *cs, *remap_info = rq->i915->l3_parity.remap_info[slice];
1745 int i;
1746
1747 if (!remap_info)
1748 return 0;
1749
1750 cs = intel_ring_begin(rq, GEN7_L3LOG_SIZE/4 * 2 + 2);
1751 if (IS_ERR(cs))
1752 return PTR_ERR(cs);
1753
1754 /*
1755 * Note: We do not worry about the concurrent register cacheline hang
1756 * here because no other code should access these registers other than
1757 * at initialization time.
1758 */
1759 *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
1760 for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
1761 *cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
1762 *cs++ = remap_info[i];
1763 }
1764 *cs++ = MI_NOOP;
1765 intel_ring_advance(rq, cs);
1766
1767 return 0;
1768}
1769
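/*
 * Bring the engine onto rq's context: reload the ppgtt page directory if
 * needed, emit MI_SET_CONTEXT for the render engine, and replay any pending
 * L3 slice remaps.
 */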
1770static int switch_context(struct i915_request *rq)
1771{
1772 struct intel_engine_cs *engine = rq->engine;
1773 struct i915_gem_context *ctx = rq->gem_context;
1774 struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
1775 unsigned int unwind_mm = 0;
1776 u32 hw_flags = 0;
1777 int ret, i;
1778
1779 lockdep_assert_held(&rq->i915->drm.struct_mutex);
1780 GEM_BUG_ON(HAS_EXECLISTS(rq->i915));
1781
1782 if (ppgtt) {
1783 int loops;
1784
1785 /*
 1786		 * Baytrail takes a little more convincing that it really needs
1787 * to reload the PD between contexts. It is not just a little
1788 * longer, as adding more stalls after the load_pd_dir (i.e.
1789 * adding a long loop around flush_pd_dir) is not as effective
1790 * as reloading the PD umpteen times. 32 is derived from
1791 * experimentation (gem_exec_parallel/fds) and has no good
1792 * explanation.
1793 */
1794 loops = 1;
1795 if (engine->id == BCS0 && IS_VALLEYVIEW(engine->i915))
1796 loops = 32;
1797
1798 do {
1799 ret = load_pd_dir(rq, ppgtt);
1800 if (ret)
1801 goto err;
1802 } while (--loops);
1803
1804 if (ppgtt->pd_dirty_engines & engine->mask) {
1805 unwind_mm = engine->mask;
1806 ppgtt->pd_dirty_engines &= ~unwind_mm;
1807 hw_flags = MI_FORCE_RESTORE;
1808 }
1809 }
1810
1811 if (rq->hw_context->state) {
1812 GEM_BUG_ON(engine->id != RCS0);
1813
1814 /*
1815 * The kernel context(s) is treated as pure scratch and is not
1816 * expected to retain any state (as we sacrifice it during
1817 * suspend and on resume it may be corrupted). This is ok,
1818 * as nothing actually executes using the kernel context; it
1819 * is purely used for flushing user contexts.
1820 */
1821 if (i915_gem_context_is_kernel(ctx))
1822 hw_flags = MI_RESTORE_INHIBIT;
1823
1824 ret = mi_set_context(rq, hw_flags);
1825 if (ret)
1826 goto err_mm;
1827 }
1828
1829 if (ppgtt) {
1830 ret = engine->emit_flush(rq, EMIT_INVALIDATE);
1831 if (ret)
1832 goto err_mm;
1833
1834 ret = flush_pd_dir(rq);
1835 if (ret)
1836 goto err_mm;
1837
1838 /*
1839 * Not only do we need a full barrier (post-sync write) after
1840 * invalidating the TLBs, but we need to wait a little bit
1841 * longer. Whether this is merely delaying us, or the
1842 * subsequent flush is a key part of serialising with the
1843 * post-sync op, this extra pass appears vital before a
1844 * mm switch!
1845 */
1846 ret = engine->emit_flush(rq, EMIT_INVALIDATE);
1847 if (ret)
1848 goto err_mm;
1849
1850 ret = engine->emit_flush(rq, EMIT_FLUSH);
1851 if (ret)
1852 goto err_mm;
1853 }
1854
1855 if (ctx->remap_slice) {
1856 for (i = 0; i < MAX_L3_SLICES; i++) {
1857 if (!(ctx->remap_slice & BIT(i)))
1858 continue;
1859
1860 ret = remap_l3(rq, i);
1861 if (ret)
1862 goto err_mm;
1863 }
1864
1865 ctx->remap_slice = 0;
1866 }
1867
1868 return 0;
1869
1870err_mm:
1871 if (unwind_mm)
1872 ppgtt->pd_dirty_engines |= unwind_mm;
1873err:
1874 return ret;
1875}
1876
1877static int ring_request_alloc(struct i915_request *request)
1878{
1879 int ret;
1880
1881 GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
1882 GEM_BUG_ON(request->timeline->has_initial_breadcrumb);
1883
1884 /*
1885 * Flush enough space to reduce the likelihood of waiting after
1886 * we start building the request - in which case we will just
1887 * have to repeat work.
1888 */
1889 request->reserved_space += LEGACY_REQUEST_SIZE;
1890
1891 /* Unconditionally invalidate GPU caches and TLBs. */
1892 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1893 if (ret)
1894 return ret;
1895
1896 ret = switch_context(request);
1897 if (ret)
1898 return ret;
1899
1900 request->reserved_space -= LEGACY_REQUEST_SIZE;
1901 return 0;
1902}
1903
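/*
 * Slow path for intel_ring_begin(): wait for enough of the oldest requests
 * on this ring to retire so that at least @bytes become available.
 */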
1904static noinline int wait_for_space(struct intel_ring *ring, unsigned int bytes)
1905{
1906 struct i915_request *target;
1907 long timeout;
1908
1909 lockdep_assert_held(&ring->vma->vm->i915->drm.struct_mutex);
1910
1911 if (intel_ring_update_space(ring) >= bytes)
1912 return 0;
1913
1914 GEM_BUG_ON(list_empty(&ring->request_list));
1915 list_for_each_entry(target, &ring->request_list, ring_link) {
1916 /* Would completion of this request free enough space? */
1917 if (bytes <= __intel_ring_space(target->postfix,
1918 ring->emit, ring->size))
1919 break;
1920 }
1921
1922 if (WARN_ON(&target->ring_link == &ring->request_list))
1923 return -ENOSPC;
1924
1925 timeout = i915_request_wait(target,
1926 I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
1927 MAX_SCHEDULE_TIMEOUT);
1928 if (timeout < 0)
1929 return timeout;
1930
1931 i915_request_retire_upto(target);
1932
1933 intel_ring_update_space(ring);
1934 GEM_BUG_ON(ring->space < bytes);
1935 return 0;
1936}
1937
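/*
 * Reserve @num_dwords of space in the ring, wrapping and/or waiting for old
 * requests to retire as needed, and return a pointer for the caller to fill.
 * The usual pattern, as used throughout this file, is:
 *
 *	cs = intel_ring_begin(rq, 2);
 *	if (IS_ERR(cs))
 *		return PTR_ERR(cs);
 *	*cs++ = MI_NOOP;
 *	*cs++ = MI_NOOP;
 *	intel_ring_advance(rq, cs);
 */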
1938u32 *intel_ring_begin(struct i915_request *rq, unsigned int num_dwords)
1939{
1940 struct intel_ring *ring = rq->ring;
1941 const unsigned int remain_usable = ring->effective_size - ring->emit;
1942 const unsigned int bytes = num_dwords * sizeof(u32);
1943 unsigned int need_wrap = 0;
1944 unsigned int total_bytes;
1945 u32 *cs;
1946
1947 /* Packets must be qword aligned. */
1948 GEM_BUG_ON(num_dwords & 1);
1949
1950 total_bytes = bytes + rq->reserved_space;
1951 GEM_BUG_ON(total_bytes > ring->effective_size);
1952
1953 if (unlikely(total_bytes > remain_usable)) {
1954 const int remain_actual = ring->size - ring->emit;
1955
1956 if (bytes > remain_usable) {
1957 /*
1958 * Not enough space for the basic request. So need to
1959 * flush out the remainder and then wait for
1960 * base + reserved.
1961 */
1962 total_bytes += remain_actual;
1963 need_wrap = remain_actual | 1;
1964 } else {
1965 /*
1966 * The base request will fit but the reserved space
1967 * falls off the end. So we don't need an immediate
1968 * wrap and only need to effectively wait for the
1969 * reserved size from the start of ringbuffer.
1970 */
1971 total_bytes = rq->reserved_space + remain_actual;
1972 }
1973 }
1974
1975 if (unlikely(total_bytes > ring->space)) {
1976 int ret;
1977
1978 /*
1979 * Space is reserved in the ringbuffer for finalising the
1980 * request, as that cannot be allowed to fail. During request
1981 * finalisation, reserved_space is set to 0 to stop the
1982 * overallocation and the assumption is that then we never need
1983 * to wait (which has the risk of failing with EINTR).
1984 *
1985 * See also i915_request_alloc() and i915_request_add().
1986 */
1987 GEM_BUG_ON(!rq->reserved_space);
1988
1989 ret = wait_for_space(ring, total_bytes);
1990 if (unlikely(ret))
1991 return ERR_PTR(ret);
1992 }
1993
1994 if (unlikely(need_wrap)) {
1995 need_wrap &= ~1;
1996 GEM_BUG_ON(need_wrap > ring->space);
1997 GEM_BUG_ON(ring->emit + need_wrap > ring->size);
1998 GEM_BUG_ON(!IS_ALIGNED(need_wrap, sizeof(u64)));
1999
2000		/* Fill the tail with MI_NOOP (which encodes as zero) */
2001 memset64(ring->vaddr + ring->emit, 0, need_wrap / sizeof(u64));
2002 ring->space -= need_wrap;
2003 ring->emit = 0;
2004 }
2005
2006 GEM_BUG_ON(ring->emit > ring->size - bytes);
2007 GEM_BUG_ON(ring->space < bytes);
2008 cs = ring->vaddr + ring->emit;
2009 GEM_DEBUG_EXEC(memset32(cs, POISON_INUSE, bytes / sizeof(*cs)));
2010 ring->emit += bytes;
2011 ring->space -= bytes;
2012
2013 return cs;
2014}
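/*
 * Usage sketch (editor's illustration, not called anywhere in the
 * driver): the canonical emit pattern used by the helpers in this
 * file.  Space is requested as an even number of dwords (packets must
 * stay qword aligned), the commands are written through the returned
 * pointer, and intel_ring_advance() then verifies that exactly the
 * requested amount was consumed.
 */
static int __maybe_unused example_emit_noops(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_NOOP;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);
	return 0;
}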
2015
2016/* Align the ring tail to a cacheline boundary */
2017int intel_ring_cacheline_align(struct i915_request *rq)
2018{
2019 int num_dwords;
2020 void *cs;
2021
2022 num_dwords = (rq->ring->emit & (CACHELINE_BYTES - 1)) / sizeof(u32);
2023 if (num_dwords == 0)
2024 return 0;
2025
2026 num_dwords = CACHELINE_DWORDS - num_dwords;
2027 GEM_BUG_ON(num_dwords & 1);
2028
2029 cs = intel_ring_begin(rq, num_dwords);
2030 if (IS_ERR(cs))
2031 return PTR_ERR(cs);
2032
2033 memset64(cs, (u64)MI_NOOP << 32 | MI_NOOP, num_dwords / 2);
2034 intel_ring_advance(rq, cs);
2035
2036 GEM_BUG_ON(rq->ring->emit & (CACHELINE_BYTES - 1));
2037 return 0;
2038}
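/*
 * Worked example (illustrative, assuming CACHELINE_BYTES == 64 and so
 * CACHELINE_DWORDS == 16): a ring tail at byte offset 0x138 has
 * 0x38 bytes == 14 dwords used in its cacheline, so 16 - 14 == 2
 * padding MI_NOOPs (one qword, hence the memset64() above) advance the
 * tail to the 0x140 boundary.
 */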
2039
2040static void gen6_bsd_submit_request(struct i915_request *request)
2041{
2042 struct intel_uncore *uncore = request->engine->uncore;
2043
2044 intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
2045
2046 /* Every tail move must follow the sequence below */
2047
2048 /* Disable notification that the ring is IDLE. The GT
2049 * will then assume that it is busy and bring it out of rc6.
2050 */
2051 intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
2052 _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
2053
2054 /* Clear the context id. Here be magic! */
2055 intel_uncore_write64_fw(uncore, GEN6_BSD_RNCID, 0x0);
2056
2057 /* Wait for the ring not to be idle, i.e. for it to wake up. */
2058 if (__intel_wait_for_register_fw(uncore,
2059 GEN6_BSD_SLEEP_PSMI_CONTROL,
2060 GEN6_BSD_SLEEP_INDICATOR,
2061 0,
2062 1000, 0, NULL))
2063 DRM_ERROR("timed out waiting for the BSD ring to wake up\n");
2064
2065 /* Now that the ring is fully powered up, update the tail */
2066 i9xx_submit_request(request);
2067
2068 /* Let the ring send IDLE messages to the GT again,
2069 * and so let it sleep to conserve power when idle.
2070 */
2071 intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
2072 _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
2073
2074 intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL);
2075}
2076
2077static int mi_flush_dw(struct i915_request *rq, u32 flags)
2078{
2079 u32 cmd, *cs;
2080
2081 cs = intel_ring_begin(rq, 4);
2082 if (IS_ERR(cs))
2083 return PTR_ERR(cs);
2084
2085 cmd = MI_FLUSH_DW;
2086
2087 /*
2088 * We always require a command barrier so that subsequent
2089 * commands, such as breadcrumb interrupts, are strictly ordered
2090	 * with respect to the contents of the write cache being flushed to
2091	 * memory (and are thus coherent from the CPU's point of view).
2092 */
2093 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2094
2095 /*
2096 * Bspec vol 1c.3 - blitter engine command streamer:
2097 * "If ENABLED, all TLBs will be invalidated once the flush
2098 * operation is complete. This bit is only valid when the
2099 * Post-Sync Operation field is a value of 1h or 3h."
2100 */
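	/*
	 * Editor's note (assumption based on the flag name): the
	 * MI_FLUSH_DW_OP_STOREDW post-sync write selected above is what
	 * satisfies that restriction, so TLB-invalidate flags passed in by
	 * the caller take effect.
	 */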
2101 cmd |= flags;
2102
2103 *cs++ = cmd;
2104 *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2105 *cs++ = 0;
2106 *cs++ = MI_NOOP;
2107
2108 intel_ring_advance(rq, cs);
2109
2110 return 0;
2111}
2112
2113static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
2114{
2115 return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
2116}
2117
2118static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode)
2119{
2120 return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
2121}
2122
2123static int
2124hsw_emit_bb_start(struct i915_request *rq,
2125 u64 offset, u32 len,
2126 unsigned int dispatch_flags)
2127{
2128 u32 *cs;
2129
2130 cs = intel_ring_begin(rq, 2);
2131 if (IS_ERR(cs))
2132 return PTR_ERR(cs);
2133
2134 *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
2135 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW);
2136	/* bits 0-7 are the length on GEN6+ */
2137 *cs++ = offset;
2138 intel_ring_advance(rq, cs);
2139
2140 return 0;
2141}
2142
2143static int
2144gen6_emit_bb_start(struct i915_request *rq,
2145 u64 offset, u32 len,
2146 unsigned int dispatch_flags)
2147{
2148 u32 *cs;
2149
2150 cs = intel_ring_begin(rq, 2);
2151 if (IS_ERR(cs))
2152 return PTR_ERR(cs);
2153
2154 *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
2155 0 : MI_BATCH_NON_SECURE_I965);
2156	/* bits 0-7 are the length on GEN6+ */
2157 *cs++ = offset;
2158 intel_ring_advance(rq, cs);
2159
2160 return 0;
2161}
2162
2163/* Blitter support (SandyBridge+) */
2164
2165static int gen6_ring_flush(struct i915_request *rq, u32 mode)
2166{
2167 return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
2168}
2169
2170static void intel_ring_init_irq(struct drm_i915_private *dev_priv,
2171 struct intel_engine_cs *engine)
2172{
2173 if (INTEL_GEN(dev_priv) >= 6) {
2174 engine->irq_enable = gen6_irq_enable;
2175 engine->irq_disable = gen6_irq_disable;
2176 } else if (INTEL_GEN(dev_priv) >= 5) {
2177 engine->irq_enable = gen5_irq_enable;
2178 engine->irq_disable = gen5_irq_disable;
2179 } else if (INTEL_GEN(dev_priv) >= 3) {
2180 engine->irq_enable = i9xx_irq_enable;
2181 engine->irq_disable = i9xx_irq_disable;
2182 } else {
2183 engine->irq_enable = i8xx_irq_enable;
2184 engine->irq_disable = i8xx_irq_disable;
2185 }
2186}
2187
2188static void i9xx_set_default_submission(struct intel_engine_cs *engine)
2189{
2190 engine->submit_request = i9xx_submit_request;
2191 engine->cancel_requests = cancel_requests;
2192
2193 engine->park = NULL;
2194 engine->unpark = NULL;
2195}
2196
2197static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine)
2198{
2199 i9xx_set_default_submission(engine);
2200 engine->submit_request = gen6_bsd_submit_request;
2201}
2202
2203static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
2204 struct intel_engine_cs *engine)
2205{
2206 /* gen8+ are only supported with execlists */
2207 GEM_BUG_ON(INTEL_GEN(dev_priv) >= 8);
2208
2209 intel_ring_init_irq(dev_priv, engine);
2210
2211 engine->init_hw = init_ring_common;
2212 engine->reset.prepare = reset_prepare;
2213 engine->reset.reset = reset_ring;
2214 engine->reset.finish = reset_finish;
2215
2216 engine->cops = &ring_context_ops;
2217 engine->request_alloc = ring_request_alloc;
2218
2219 /*
2220 * Using a global execution timeline; the previous final breadcrumb is
2221	 * equivalent to our next initial breadcrumb so we can elide
2222 * engine->emit_init_breadcrumb().
2223 */
2224 engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb;
2225 if (IS_GEN(dev_priv, 5))
2226 engine->emit_fini_breadcrumb = gen5_emit_breadcrumb;
2227
2228 engine->set_default_submission = i9xx_set_default_submission;
2229
2230 if (INTEL_GEN(dev_priv) >= 6)
2231 engine->emit_bb_start = gen6_emit_bb_start;
2232 else if (INTEL_GEN(dev_priv) >= 4)
2233 engine->emit_bb_start = i965_emit_bb_start;
2234 else if (IS_I830(dev_priv) || IS_I845G(dev_priv))
2235 engine->emit_bb_start = i830_emit_bb_start;
2236 else
2237 engine->emit_bb_start = i915_emit_bb_start;
2238}
2239
2240int intel_init_render_ring_buffer(struct intel_engine_cs *engine)
2241{
2242 struct drm_i915_private *dev_priv = engine->i915;
2243 int ret;
2244
2245 intel_ring_default_vfuncs(dev_priv, engine);
2246
2247 if (HAS_L3_DPF(dev_priv))
2248 engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
2249
2250 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
2251
2252 if (INTEL_GEN(dev_priv) >= 7) {
2253 engine->init_context = intel_rcs_ctx_init;
2254 engine->emit_flush = gen7_render_ring_flush;
2255 engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb;
2256 } else if (IS_GEN(dev_priv, 6)) {
2257 engine->init_context = intel_rcs_ctx_init;
2258 engine->emit_flush = gen6_render_ring_flush;
2259 engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb;
2260 } else if (IS_GEN(dev_priv, 5)) {
2261 engine->emit_flush = gen4_render_ring_flush;
2262 } else {
2263 if (INTEL_GEN(dev_priv) < 4)
2264 engine->emit_flush = gen2_render_ring_flush;
2265 else
2266 engine->emit_flush = gen4_render_ring_flush;
2267 engine->irq_enable_mask = I915_USER_INTERRUPT;
2268 }
2269
2270 if (IS_HASWELL(dev_priv))
2271 engine->emit_bb_start = hsw_emit_bb_start;
2272
2273 engine->init_hw = init_render_ring;
2274
2275 ret = intel_init_ring_buffer(engine);
2276 if (ret)
2277 return ret;
2278
2279 return 0;
2280}
2281
2282int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
2283{
2284 struct drm_i915_private *dev_priv = engine->i915;
2285
2286 intel_ring_default_vfuncs(dev_priv, engine);
2287
2288 if (INTEL_GEN(dev_priv) >= 6) {
2289		/* gen6 bsd needs a special workaround for tail updates */
2290 if (IS_GEN(dev_priv, 6))
2291 engine->set_default_submission = gen6_bsd_set_default_submission;
2292 engine->emit_flush = gen6_bsd_ring_flush;
2293 engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
2294
2295 if (IS_GEN(dev_priv, 6))
2296 engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
2297 else
2298 engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2299 } else {
2300 engine->emit_flush = bsd_ring_flush;
2301 if (IS_GEN(dev_priv, 5))
2302 engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
2303 else
2304 engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
2305 }
2306
2307 return intel_init_ring_buffer(engine);
2308}
2309
2310int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
2311{
2312 struct drm_i915_private *dev_priv = engine->i915;
2313
2314 GEM_BUG_ON(INTEL_GEN(dev_priv) < 6);
2315
2316 intel_ring_default_vfuncs(dev_priv, engine);
2317
2318 engine->emit_flush = gen6_ring_flush;
2319 engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
2320
2321 if (IS_GEN(dev_priv, 6))
2322 engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
2323 else
2324 engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2325
2326 return intel_init_ring_buffer(engine);
2327}
2328
2329int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
2330{
2331 struct drm_i915_private *dev_priv = engine->i915;
2332
2333 GEM_BUG_ON(INTEL_GEN(dev_priv) < 7);
2334
2335 intel_ring_default_vfuncs(dev_priv, engine);
2336
2337 engine->emit_flush = gen6_ring_flush;
2338 engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
2339 engine->irq_enable = hsw_vebox_irq_enable;
2340 engine->irq_disable = hsw_vebox_irq_disable;
2341
2342 engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
2343
2344 return intel_init_ring_buffer(engine);
2345}