drivers/gpu/drm/i915/intel_ringbuffer.c at v4.14

tjh.dev / kernel
fork atom
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork atom
kernel / drivers / gpu / drm / i915 / intel_ringbuffer.c
at v4.14 2284 lines 61 kB view raw
wrap content
   1/*
   2 * Copyright © 2008-2010 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * Authors:
  24 *    Eric Anholt <eric@anholt.net>
  25 *    Zou Nan hai <nanhai.zou@intel.com>
  26 *    Xiang Hai hao<haihao.xiang@intel.com>
  27 *
  28 */
  29
  30#include <linux/log2.h>
  31#include <drm/drmP.h>
  32#include "i915_drv.h"
  33#include <drm/i915_drm.h>
  34#include "i915_trace.h"
  35#include "intel_drv.h"
  36
/* Rough estimate of the typical request size, performing a flush,
 * set-context and then emitting the batch.
 *
 * NOTE(review): the unit (presumably bytes) is not visible from this
 * definition - confirm against the users of the macro.
 */
#define LEGACY_REQUEST_SIZE 200
  41
  42static unsigned int __intel_ring_space(unsigned int head,
  43				       unsigned int tail,
  44				       unsigned int size)
  45{
  46	/*
  47	 * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
  48	 * same cacheline, the Head Pointer must not be greater than the Tail
  49	 * Pointer."
  50	 */
  51	GEM_BUG_ON(!is_power_of_2(size));
  52	return (head - tail - CACHELINE_BYTES) & (size - 1);
  53}
  54
  55unsigned int intel_ring_update_space(struct intel_ring *ring)
  56{
  57	unsigned int space;
  58
  59	space = __intel_ring_space(ring->head, ring->emit, ring->size);
  60
  61	ring->space = space;
  62	return space;
  63}
  64
  65static int
  66gen2_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
  67{
  68	u32 cmd, *cs;
  69
  70	cmd = MI_FLUSH;
  71
  72	if (mode & EMIT_INVALIDATE)
  73		cmd |= MI_READ_FLUSH;
  74
  75	cs = intel_ring_begin(req, 2);
  76	if (IS_ERR(cs))
  77		return PTR_ERR(cs);
  78
  79	*cs++ = cmd;
  80	*cs++ = MI_NOOP;
  81	intel_ring_advance(req, cs);
  82
  83	return 0;
  84}
  85
  86static int
  87gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
  88{
  89	u32 cmd, *cs;
  90
  91	/*
  92	 * read/write caches:
  93	 *
  94	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
  95	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
  96	 * also flushed at 2d versus 3d pipeline switches.
  97	 *
  98	 * read-only caches:
  99	 *
 100	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
 101	 * MI_READ_FLUSH is set, and is always flushed on 965.
 102	 *
 103	 * I915_GEM_DOMAIN_COMMAND may not exist?
 104	 *
 105	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
 106	 * invalidated when MI_EXE_FLUSH is set.
 107	 *
 108	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
 109	 * invalidated with every MI_FLUSH.
 110	 *
 111	 * TLBs:
 112	 *
 113	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
 114	 * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and
 115	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
 116	 * are flushed at any MI_FLUSH.
 117	 */
 118
 119	cmd = MI_FLUSH;
 120	if (mode & EMIT_INVALIDATE) {
 121		cmd |= MI_EXE_FLUSH;
 122		if (IS_G4X(req->i915) || IS_GEN5(req->i915))
 123			cmd |= MI_INVALIDATE_ISP;
 124	}
 125
 126	cs = intel_ring_begin(req, 2);
 127	if (IS_ERR(cs))
 128		return PTR_ERR(cs);
 129
 130	*cs++ = cmd;
 131	*cs++ = MI_NOOP;
 132	intel_ring_advance(req, cs);
 133
 134	return 0;
 135}
 136
 137/**
 138 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 139 * implementing two workarounds on gen6.  From section 1.4.7.1
 140 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 141 *
 142 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 143 * produced by non-pipelined state commands), software needs to first
 144 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 145 * 0.
 146 *
 147 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 148 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 149 *
 150 * And the workaround for these two requires this workaround first:
 151 *
 152 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 153 * BEFORE the pipe-control with a post-sync op and no write-cache
 154 * flushes.
 155 *
 156 * And this last workaround is tricky because of the requirements on
 157 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 158 * volume 2 part 1:
 159 *
 160 *     "1 of the following must also be set:
 161 *      - Render Target Cache Flush Enable ([12] of DW1)
 162 *      - Depth Cache Flush Enable ([0] of DW1)
 163 *      - Stall at Pixel Scoreboard ([1] of DW1)
 164 *      - Depth Stall ([13] of DW1)
 165 *      - Post-Sync Operation ([13] of DW1)
 166 *      - Notify Enable ([8] of DW1)"
 167 *
 168 * The cache flushes require the workaround flush that triggered this
 169 * one, so we can't use it.  Depth stall would trigger the same.
 170 * Post-sync nonzero is what triggered this second workaround, so we
 171 * can't use that one either.  Notify enable is IRQs, which aren't
 172 * really our business.  That leaves only stall at scoreboard.
 173 */
 174static int
 175intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
 176{
 177	u32 scratch_addr =
 178		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
 179	u32 *cs;
 180
 181	cs = intel_ring_begin(req, 6);
 182	if (IS_ERR(cs))
 183		return PTR_ERR(cs);
 184
 185	*cs++ = GFX_OP_PIPE_CONTROL(5);
 186	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 187	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
 188	*cs++ = 0; /* low dword */
 189	*cs++ = 0; /* high dword */
 190	*cs++ = MI_NOOP;
 191	intel_ring_advance(req, cs);
 192
 193	cs = intel_ring_begin(req, 6);
 194	if (IS_ERR(cs))
 195		return PTR_ERR(cs);
 196
 197	*cs++ = GFX_OP_PIPE_CONTROL(5);
 198	*cs++ = PIPE_CONTROL_QW_WRITE;
 199	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
 200	*cs++ = 0;
 201	*cs++ = 0;
 202	*cs++ = MI_NOOP;
 203	intel_ring_advance(req, cs);
 204
 205	return 0;
 206}
 207
 208static int
 209gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 210{
 211	u32 scratch_addr =
 212		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
 213	u32 *cs, flags = 0;
 214	int ret;
 215
 216	/* Force SNB workarounds for PIPE_CONTROL flushes */
 217	ret = intel_emit_post_sync_nonzero_flush(req);
 218	if (ret)
 219		return ret;
 220
 221	/* Just flush everything.  Experiments have shown that reducing the
 222	 * number of bits based on the write domains has little performance
 223	 * impact.
 224	 */
 225	if (mode & EMIT_FLUSH) {
 226		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 227		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 228		/*
 229		 * Ensure that any following seqno writes only happen
 230		 * when the render cache is indeed flushed.
 231		 */
 232		flags |= PIPE_CONTROL_CS_STALL;
 233	}
 234	if (mode & EMIT_INVALIDATE) {
 235		flags |= PIPE_CONTROL_TLB_INVALIDATE;
 236		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 237		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 238		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 239		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 240		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 241		/*
 242		 * TLB invalidate requires a post-sync write.
 243		 */
 244		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 245	}
 246
 247	cs = intel_ring_begin(req, 4);
 248	if (IS_ERR(cs))
 249		return PTR_ERR(cs);
 250
 251	*cs++ = GFX_OP_PIPE_CONTROL(4);
 252	*cs++ = flags;
 253	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
 254	*cs++ = 0;
 255	intel_ring_advance(req, cs);
 256
 257	return 0;
 258}
 259
 260static int
 261gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req)
 262{
 263	u32 *cs;
 264
 265	cs = intel_ring_begin(req, 4);
 266	if (IS_ERR(cs))
 267		return PTR_ERR(cs);
 268
 269	*cs++ = GFX_OP_PIPE_CONTROL(4);
 270	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 271	*cs++ = 0;
 272	*cs++ = 0;
 273	intel_ring_advance(req, cs);
 274
 275	return 0;
 276}
 277
 278static int
 279gen7_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 280{
 281	u32 scratch_addr =
 282		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
 283	u32 *cs, flags = 0;
 284
 285	/*
 286	 * Ensure that any following seqno writes only happen when the render
 287	 * cache is indeed flushed.
 288	 *
 289	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
 290	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
 291	 * don't try to be clever and just set it unconditionally.
 292	 */
 293	flags |= PIPE_CONTROL_CS_STALL;
 294
 295	/* Just flush everything.  Experiments have shown that reducing the
 296	 * number of bits based on the write domains has little performance
 297	 * impact.
 298	 */
 299	if (mode & EMIT_FLUSH) {
 300		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 301		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 302		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
 303		flags |= PIPE_CONTROL_FLUSH_ENABLE;
 304	}
 305	if (mode & EMIT_INVALIDATE) {
 306		flags |= PIPE_CONTROL_TLB_INVALIDATE;
 307		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 308		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 309		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 310		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 311		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 312		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
 313		/*
 314		 * TLB invalidate requires a post-sync write.
 315		 */
 316		flags |= PIPE_CONTROL_QW_WRITE;
 317		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
 318
 319		flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
 320
 321		/* Workaround: we must issue a pipe_control with CS-stall bit
 322		 * set before a pipe_control command that has the state cache
 323		 * invalidate bit set. */
 324		gen7_render_ring_cs_stall_wa(req);
 325	}
 326
 327	cs = intel_ring_begin(req, 4);
 328	if (IS_ERR(cs))
 329		return PTR_ERR(cs);
 330
 331	*cs++ = GFX_OP_PIPE_CONTROL(4);
 332	*cs++ = flags;
 333	*cs++ = scratch_addr;
 334	*cs++ = 0;
 335	intel_ring_advance(req, cs);
 336
 337	return 0;
 338}
 339
 340static int
 341gen8_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
 342{
 343	u32 flags;
 344	u32 *cs;
 345
 346	cs = intel_ring_begin(req, mode & EMIT_INVALIDATE ? 12 : 6);
 347	if (IS_ERR(cs))
 348		return PTR_ERR(cs);
 349
 350	flags = PIPE_CONTROL_CS_STALL;
 351
 352	if (mode & EMIT_FLUSH) {
 353		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 354		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 355		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
 356		flags |= PIPE_CONTROL_FLUSH_ENABLE;
 357	}
 358	if (mode & EMIT_INVALIDATE) {
 359		flags |= PIPE_CONTROL_TLB_INVALIDATE;
 360		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 361		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 362		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 363		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 364		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 365		flags |= PIPE_CONTROL_QW_WRITE;
 366		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
 367
 368		/* WaCsStallBeforeStateCacheInvalidate:bdw,chv */
 369		cs = gen8_emit_pipe_control(cs,
 370					    PIPE_CONTROL_CS_STALL |
 371					    PIPE_CONTROL_STALL_AT_SCOREBOARD,
 372					    0);
 373	}
 374
 375	cs = gen8_emit_pipe_control(cs, flags,
 376				    i915_ggtt_offset(req->engine->scratch) +
 377				    2 * CACHELINE_BYTES);
 378
 379	intel_ring_advance(req, cs);
 380
 381	return 0;
 382}
 383
 384static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
 385{
 386	struct drm_i915_private *dev_priv = engine->i915;
 387	u32 addr;
 388
 389	addr = dev_priv->status_page_dmah->busaddr;
 390	if (INTEL_GEN(dev_priv) >= 4)
 391		addr |= (dev_priv->status_page_dmah->busaddr >> 28) & 0xf0;
 392	I915_WRITE(HWS_PGA, addr);
 393}
 394
 395static void intel_ring_setup_status_page(struct intel_engine_cs *engine)
 396{
 397	struct drm_i915_private *dev_priv = engine->i915;
 398	i915_reg_t mmio;
 399
 400	/* The ring status page addresses are no longer next to the rest of
 401	 * the ring registers as of gen7.
 402	 */
 403	if (IS_GEN7(dev_priv)) {
 404		switch (engine->id) {
 405		case RCS:
 406			mmio = RENDER_HWS_PGA_GEN7;
 407			break;
 408		case BCS:
 409			mmio = BLT_HWS_PGA_GEN7;
 410			break;
 411		/*
 412		 * VCS2 actually doesn't exist on Gen7. Only shut up
 413		 * gcc switch check warning
 414		 */
 415		case VCS2:
 416		case VCS:
 417			mmio = BSD_HWS_PGA_GEN7;
 418			break;
 419		case VECS:
 420			mmio = VEBOX_HWS_PGA_GEN7;
 421			break;
 422		}
 423	} else if (IS_GEN6(dev_priv)) {
 424		mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
 425	} else {
 426		/* XXX: gen8 returns to sanity */
 427		mmio = RING_HWS_PGA(engine->mmio_base);
 428	}
 429
 430	I915_WRITE(mmio, engine->status_page.ggtt_offset);
 431	POSTING_READ(mmio);
 432
 433	/*
 434	 * Flush the TLB for this page
 435	 *
 436	 * FIXME: These two bits have disappeared on gen8, so a question
 437	 * arises: do we still need this and if so how should we go about
 438	 * invalidating the TLB?
 439	 */
 440	if (IS_GEN(dev_priv, 6, 7)) {
 441		i915_reg_t reg = RING_INSTPM(engine->mmio_base);
 442
 443		/* ring should be idle before issuing a sync flush*/
 444		WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);
 445
 446		I915_WRITE(reg,
 447			   _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
 448					      INSTPM_SYNC_FLUSH));
 449		if (intel_wait_for_register(dev_priv,
 450					    reg, INSTPM_SYNC_FLUSH, 0,
 451					    1000))
 452			DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
 453				  engine->name);
 454	}
 455}
 456
 457static bool stop_ring(struct intel_engine_cs *engine)
 458{
 459	struct drm_i915_private *dev_priv = engine->i915;
 460
 461	if (INTEL_GEN(dev_priv) > 2) {
 462		I915_WRITE_MODE(engine, _MASKED_BIT_ENABLE(STOP_RING));
 463		if (intel_wait_for_register(dev_priv,
 464					    RING_MI_MODE(engine->mmio_base),
 465					    MODE_IDLE,
 466					    MODE_IDLE,
 467					    1000)) {
 468			DRM_ERROR("%s : timed out trying to stop ring\n",
 469				  engine->name);
 470			/* Sometimes we observe that the idle flag is not
 471			 * set even though the ring is empty. So double
 472			 * check before giving up.
 473			 */
 474			if (I915_READ_HEAD(engine) != I915_READ_TAIL(engine))
 475				return false;
 476		}
 477	}
 478
 479	I915_WRITE_CTL(engine, 0);
 480	I915_WRITE_HEAD(engine, 0);
 481	I915_WRITE_TAIL(engine, 0);
 482
 483	if (INTEL_GEN(dev_priv) > 2) {
 484		(void)I915_READ_CTL(engine);
 485		I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING));
 486	}
 487
 488	return (I915_READ_HEAD(engine) & HEAD_ADDR) == 0;
 489}
 490
 491static int init_ring_common(struct intel_engine_cs *engine)
 492{
 493	struct drm_i915_private *dev_priv = engine->i915;
 494	struct intel_ring *ring = engine->buffer;
 495	int ret = 0;
 496
 497	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 498
 499	if (!stop_ring(engine)) {
 500		/* G45 ring initialization often fails to reset head to zero */
 501		DRM_DEBUG_KMS("%s head not reset to zero "
 502			      "ctl %08x head %08x tail %08x start %08x\n",
 503			      engine->name,
 504			      I915_READ_CTL(engine),
 505			      I915_READ_HEAD(engine),
 506			      I915_READ_TAIL(engine),
 507			      I915_READ_START(engine));
 508
 509		if (!stop_ring(engine)) {
 510			DRM_ERROR("failed to set %s head to zero "
 511				  "ctl %08x head %08x tail %08x start %08x\n",
 512				  engine->name,
 513				  I915_READ_CTL(engine),
 514				  I915_READ_HEAD(engine),
 515				  I915_READ_TAIL(engine),
 516				  I915_READ_START(engine));
 517			ret = -EIO;
 518			goto out;
 519		}
 520	}
 521
 522	if (HWS_NEEDS_PHYSICAL(dev_priv))
 523		ring_setup_phys_status_page(engine);
 524	else
 525		intel_ring_setup_status_page(engine);
 526
 527	intel_engine_reset_breadcrumbs(engine);
 528
 529	/* Enforce ordering by reading HEAD register back */
 530	I915_READ_HEAD(engine);
 531
 532	/* Initialize the ring. This must happen _after_ we've cleared the ring
 533	 * registers with the above sequence (the readback of the HEAD registers
 534	 * also enforces ordering), otherwise the hw might lose the new ring
 535	 * register values. */
 536	I915_WRITE_START(engine, i915_ggtt_offset(ring->vma));
 537
 538	/* WaClearRingBufHeadRegAtInit:ctg,elk */
 539	if (I915_READ_HEAD(engine))
 540		DRM_DEBUG("%s initialization failed [head=%08x], fudging\n",
 541			  engine->name, I915_READ_HEAD(engine));
 542
 543	intel_ring_update_space(ring);
 544	I915_WRITE_HEAD(engine, ring->head);
 545	I915_WRITE_TAIL(engine, ring->tail);
 546	(void)I915_READ_TAIL(engine);
 547
 548	I915_WRITE_CTL(engine, RING_CTL_SIZE(ring->size) | RING_VALID);
 549
 550	/* If the head is still not zero, the ring is dead */
 551	if (intel_wait_for_register(dev_priv, RING_CTL(engine->mmio_base),
 552				    RING_VALID, RING_VALID,
 553				    50)) {
 554		DRM_ERROR("%s initialization failed "
 555			  "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
 556			  engine->name,
 557			  I915_READ_CTL(engine),
 558			  I915_READ_CTL(engine) & RING_VALID,
 559			  I915_READ_HEAD(engine), ring->head,
 560			  I915_READ_TAIL(engine), ring->tail,
 561			  I915_READ_START(engine),
 562			  i915_ggtt_offset(ring->vma));
 563		ret = -EIO;
 564		goto out;
 565	}
 566
 567	intel_engine_init_hangcheck(engine);
 568
 569out:
 570	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 571
 572	return ret;
 573}
 574
 575static void reset_ring_common(struct intel_engine_cs *engine,
 576			      struct drm_i915_gem_request *request)
 577{
 578	/* Try to restore the logical GPU state to match the continuation
 579	 * of the request queue. If we skip the context/PD restore, then
 580	 * the next request may try to execute assuming that its context
 581	 * is valid and loaded on the GPU and so may try to access invalid
 582	 * memory, prompting repeated GPU hangs.
 583	 *
 584	 * If the request was guilty, we still restore the logical state
 585	 * in case the next request requires it (e.g. the aliasing ppgtt),
 586	 * but skip over the hung batch.
 587	 *
 588	 * If the request was innocent, we try to replay the request with
 589	 * the restored context.
 590	 */
 591	if (request) {
 592		struct drm_i915_private *dev_priv = request->i915;
 593		struct intel_context *ce = &request->ctx->engine[engine->id];
 594		struct i915_hw_ppgtt *ppgtt;
 595
 596		/* FIXME consider gen8 reset */
 597
 598		if (ce->state) {
 599			I915_WRITE(CCID,
 600				   i915_ggtt_offset(ce->state) |
 601				   BIT(8) /* must be set! */ |
 602				   CCID_EXTENDED_STATE_SAVE |
 603				   CCID_EXTENDED_STATE_RESTORE |
 604				   CCID_EN);
 605		}
 606
 607		ppgtt = request->ctx->ppgtt ?: engine->i915->mm.aliasing_ppgtt;
 608		if (ppgtt) {
 609			u32 pd_offset = ppgtt->pd.base.ggtt_offset << 10;
 610
 611			I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G);
 612			I915_WRITE(RING_PP_DIR_BASE(engine), pd_offset);
 613
 614			/* Wait for the PD reload to complete */
 615			if (intel_wait_for_register(dev_priv,
 616						    RING_PP_DIR_BASE(engine),
 617						    BIT(0), 0,
 618						    10))
 619				DRM_ERROR("Wait for reload of ppgtt page-directory timed out\n");
 620
 621			ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine);
 622		}
 623
 624		/* If the rq hung, jump to its breadcrumb and skip the batch */
 625		if (request->fence.error == -EIO)
 626			request->ring->head = request->postfix;
 627	} else {
 628		engine->legacy_active_context = NULL;
 629	}
 630}
 631
 632static int intel_rcs_ctx_init(struct drm_i915_gem_request *req)
 633{
 634	int ret;
 635
 636	ret = intel_ring_workarounds_emit(req);
 637	if (ret != 0)
 638		return ret;
 639
 640	ret = i915_gem_render_state_emit(req);
 641	if (ret)
 642		return ret;
 643
 644	return 0;
 645}
 646
 647static int init_render_ring(struct intel_engine_cs *engine)
 648{
 649	struct drm_i915_private *dev_priv = engine->i915;
 650	int ret = init_ring_common(engine);
 651	if (ret)
 652		return ret;
 653
 654	/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
 655	if (IS_GEN(dev_priv, 4, 6))
 656		I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
 657
 658	/* We need to disable the AsyncFlip performance optimisations in order
 659	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
 660	 * programmed to '1' on all products.
 661	 *
 662	 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
 663	 */
 664	if (IS_GEN(dev_priv, 6, 7))
 665		I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
 666
 667	/* Required for the hardware to program scanline values for waiting */
 668	/* WaEnableFlushTlbInvalidationMode:snb */
 669	if (IS_GEN6(dev_priv))
 670		I915_WRITE(GFX_MODE,
 671			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT));
 672
 673	/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
 674	if (IS_GEN7(dev_priv))
 675		I915_WRITE(GFX_MODE_GEN7,
 676			   _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) |
 677			   _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
 678
 679	if (IS_GEN6(dev_priv)) {
 680		/* From the Sandybridge PRM, volume 1 part 3, page 24:
 681		 * "If this bit is set, STCunit will have LRA as replacement
 682		 *  policy. [...] This bit must be reset.  LRA replacement
 683		 *  policy is not supported."
 684		 */
 685		I915_WRITE(CACHE_MODE_0,
 686			   _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
 687	}
 688
 689	if (IS_GEN(dev_priv, 6, 7))
 690		I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
 691
 692	if (INTEL_INFO(dev_priv)->gen >= 6)
 693		I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
 694
 695	return init_workarounds_ring(engine);
 696}
 697
 698static void render_ring_cleanup(struct intel_engine_cs *engine)
 699{
 700	struct drm_i915_private *dev_priv = engine->i915;
 701
 702	i915_vma_unpin_and_release(&dev_priv->semaphore);
 703}
 704
 705static u32 *gen8_rcs_signal(struct drm_i915_gem_request *req, u32 *cs)
 706{
 707	struct drm_i915_private *dev_priv = req->i915;
 708	struct intel_engine_cs *waiter;
 709	enum intel_engine_id id;
 710
 711	for_each_engine(waiter, dev_priv, id) {
 712		u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
 713		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
 714			continue;
 715
 716		*cs++ = GFX_OP_PIPE_CONTROL(6);
 717		*cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_QW_WRITE |
 718			PIPE_CONTROL_CS_STALL;
 719		*cs++ = lower_32_bits(gtt_offset);
 720		*cs++ = upper_32_bits(gtt_offset);
 721		*cs++ = req->global_seqno;
 722		*cs++ = 0;
 723		*cs++ = MI_SEMAPHORE_SIGNAL |
 724			MI_SEMAPHORE_TARGET(waiter->hw_id);
 725		*cs++ = 0;
 726	}
 727
 728	return cs;
 729}
 730
 731static u32 *gen8_xcs_signal(struct drm_i915_gem_request *req, u32 *cs)
 732{
 733	struct drm_i915_private *dev_priv = req->i915;
 734	struct intel_engine_cs *waiter;
 735	enum intel_engine_id id;
 736
 737	for_each_engine(waiter, dev_priv, id) {
 738		u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
 739		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
 740			continue;
 741
 742		*cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
 743		*cs++ = lower_32_bits(gtt_offset) | MI_FLUSH_DW_USE_GTT;
 744		*cs++ = upper_32_bits(gtt_offset);
 745		*cs++ = req->global_seqno;
 746		*cs++ = MI_SEMAPHORE_SIGNAL |
 747			MI_SEMAPHORE_TARGET(waiter->hw_id);
 748		*cs++ = 0;
 749	}
 750
 751	return cs;
 752}
 753
 754static u32 *gen6_signal(struct drm_i915_gem_request *req, u32 *cs)
 755{
 756	struct drm_i915_private *dev_priv = req->i915;
 757	struct intel_engine_cs *engine;
 758	enum intel_engine_id id;
 759	int num_rings = 0;
 760
 761	for_each_engine(engine, dev_priv, id) {
 762		i915_reg_t mbox_reg;
 763
 764		if (!(BIT(engine->hw_id) & GEN6_SEMAPHORES_MASK))
 765			continue;
 766
 767		mbox_reg = req->engine->semaphore.mbox.signal[engine->hw_id];
 768		if (i915_mmio_reg_valid(mbox_reg)) {
 769			*cs++ = MI_LOAD_REGISTER_IMM(1);
 770			*cs++ = i915_mmio_reg_offset(mbox_reg);
 771			*cs++ = req->global_seqno;
 772			num_rings++;
 773		}
 774	}
 775	if (num_rings & 1)
 776		*cs++ = MI_NOOP;
 777
 778	return cs;
 779}
 780
 781static void i9xx_submit_request(struct drm_i915_gem_request *request)
 782{
 783	struct drm_i915_private *dev_priv = request->i915;
 784
 785	i915_gem_request_submit(request);
 786
 787	I915_WRITE_TAIL(request->engine,
 788			intel_ring_set_tail(request->ring, request->tail));
 789}
 790
 791static void i9xx_emit_breadcrumb(struct drm_i915_gem_request *req, u32 *cs)
 792{
 793	*cs++ = MI_STORE_DWORD_INDEX;
 794	*cs++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
 795	*cs++ = req->global_seqno;
 796	*cs++ = MI_USER_INTERRUPT;
 797
 798	req->tail = intel_ring_offset(req, cs);
 799	assert_ring_tail_valid(req->ring, req->tail);
 800}
 801
 802static const int i9xx_emit_breadcrumb_sz = 4;
 803
 804/**
 805 * gen6_sema_emit_breadcrumb - Update the semaphore mailbox registers
 806 *
 807 * @request - request to write to the ring
 808 *
 809 * Update the mailbox registers in the *other* rings with the current seqno.
 810 * This acts like a signal in the canonical semaphore.
 811 */
 812static void gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req, u32 *cs)
 813{
 814	return i9xx_emit_breadcrumb(req,
 815				    req->engine->semaphore.signal(req, cs));
 816}
 817
 818static void gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req,
 819					u32 *cs)
 820{
 821	struct intel_engine_cs *engine = req->engine;
 822
 823	if (engine->semaphore.signal)
 824		cs = engine->semaphore.signal(req, cs);
 825
 826	*cs++ = GFX_OP_PIPE_CONTROL(6);
 827	*cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL |
 828		PIPE_CONTROL_QW_WRITE;
 829	*cs++ = intel_hws_seqno_address(engine);
 830	*cs++ = 0;
 831	*cs++ = req->global_seqno;
 832	/* We're thrashing one dword of HWS. */
 833	*cs++ = 0;
 834	*cs++ = MI_USER_INTERRUPT;
 835	*cs++ = MI_NOOP;
 836
 837	req->tail = intel_ring_offset(req, cs);
 838	assert_ring_tail_valid(req->ring, req->tail);
 839}
 840
 841static const int gen8_render_emit_breadcrumb_sz = 8;
 842
/*
 * *_ring_sync_to - sync the waiting request to the signalling request
 *
 * @req: request that is waiting; the semaphore wait is emitted into its ring
 * @signal: request which has, or will, signal
 *
 * The waiter blocks on the global seqno of @signal.
 */
 850
 851static int
 852gen8_ring_sync_to(struct drm_i915_gem_request *req,
 853		  struct drm_i915_gem_request *signal)
 854{
 855	struct drm_i915_private *dev_priv = req->i915;
 856	u64 offset = GEN8_WAIT_OFFSET(req->engine, signal->engine->id);
 857	struct i915_hw_ppgtt *ppgtt;
 858	u32 *cs;
 859
 860	cs = intel_ring_begin(req, 4);
 861	if (IS_ERR(cs))
 862		return PTR_ERR(cs);
 863
 864	*cs++ = MI_SEMAPHORE_WAIT | MI_SEMAPHORE_GLOBAL_GTT |
 865		MI_SEMAPHORE_SAD_GTE_SDD;
 866	*cs++ = signal->global_seqno;
 867	*cs++ = lower_32_bits(offset);
 868	*cs++ = upper_32_bits(offset);
 869	intel_ring_advance(req, cs);
 870
 871	/* When the !RCS engines idle waiting upon a semaphore, they lose their
 872	 * pagetables and we must reload them before executing the batch.
 873	 * We do this on the i915_switch_context() following the wait and
 874	 * before the dispatch.
 875	 */
 876	ppgtt = req->ctx->ppgtt;
 877	if (ppgtt && req->engine->id != RCS)
 878		ppgtt->pd_dirty_rings |= intel_engine_flag(req->engine);
 879	return 0;
 880}
 881
 882static int
 883gen6_ring_sync_to(struct drm_i915_gem_request *req,
 884		  struct drm_i915_gem_request *signal)
 885{
 886	u32 dw1 = MI_SEMAPHORE_MBOX |
 887		  MI_SEMAPHORE_COMPARE |
 888		  MI_SEMAPHORE_REGISTER;
 889	u32 wait_mbox = signal->engine->semaphore.mbox.wait[req->engine->hw_id];
 890	u32 *cs;
 891
 892	WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID);
 893
 894	cs = intel_ring_begin(req, 4);
 895	if (IS_ERR(cs))
 896		return PTR_ERR(cs);
 897
 898	*cs++ = dw1 | wait_mbox;
 899	/* Throughout all of the GEM code, seqno passed implies our current
 900	 * seqno is >= the last seqno executed. However for hardware the
 901	 * comparison is strictly greater than.
 902	 */
 903	*cs++ = signal->global_seqno - 1;
 904	*cs++ = 0;
 905	*cs++ = MI_NOOP;
 906	intel_ring_advance(req, cs);
 907
 908	return 0;
 909}
 910
 911static void
 912gen5_seqno_barrier(struct intel_engine_cs *engine)
 913{
 914	/* MI_STORE are internally buffered by the GPU and not flushed
 915	 * either by MI_FLUSH or SyncFlush or any other combination of
 916	 * MI commands.
 917	 *
 918	 * "Only the submission of the store operation is guaranteed.
 919	 * The write result will be complete (coherent) some time later
 920	 * (this is practically a finite period but there is no guaranteed
 921	 * latency)."
 922	 *
 923	 * Empirically, we observe that we need a delay of at least 75us to
 924	 * be sure that the seqno write is visible by the CPU.
 925	 */
 926	usleep_range(125, 250);
 927}
 928
 929static void
 930gen6_seqno_barrier(struct intel_engine_cs *engine)
 931{
 932	struct drm_i915_private *dev_priv = engine->i915;
 933
 934	/* Workaround to force correct ordering between irq and seqno writes on
 935	 * ivb (and maybe also on snb) by reading from a CS register (like
 936	 * ACTHD) before reading the status page.
 937	 *
 938	 * Note that this effectively stalls the read by the time it takes to
 939	 * do a memory transaction, which more or less ensures that the write
 940	 * from the GPU has sufficient time to invalidate the CPU cacheline.
 941	 * Alternatively we could delay the interrupt from the CS ring to give
 942	 * the write time to land, but that would incur a delay after every
 943	 * batch i.e. much more frequent than a delay when waiting for the
 944	 * interrupt (with the same net latency).
 945	 *
 946	 * Also note that to prevent whole machine hangs on gen7, we have to
 947	 * take the spinlock to guard against concurrent cacheline access.
 948	 */
 949	spin_lock_irq(&dev_priv->uncore.lock);
 950	POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
 951	spin_unlock_irq(&dev_priv->uncore.lock);
 952}
 953
 954static void
 955gen5_irq_enable(struct intel_engine_cs *engine)
 956{
 957	gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask);
 958}
 959
 960static void
 961gen5_irq_disable(struct intel_engine_cs *engine)
 962{
 963	gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask);
 964}
 965
 966static void
 967i9xx_irq_enable(struct intel_engine_cs *engine)
 968{
 969	struct drm_i915_private *dev_priv = engine->i915;
 970
 971	dev_priv->irq_mask &= ~engine->irq_enable_mask;
 972	I915_WRITE(IMR, dev_priv->irq_mask);
 973	POSTING_READ_FW(RING_IMR(engine->mmio_base));
 974}
 975
 976static void
 977i9xx_irq_disable(struct intel_engine_cs *engine)
 978{
 979	struct drm_i915_private *dev_priv = engine->i915;
 980
 981	dev_priv->irq_mask |= engine->irq_enable_mask;
 982	I915_WRITE(IMR, dev_priv->irq_mask);
 983}
 984
 985static void
 986i8xx_irq_enable(struct intel_engine_cs *engine)
 987{
 988	struct drm_i915_private *dev_priv = engine->i915;
 989
 990	dev_priv->irq_mask &= ~engine->irq_enable_mask;
 991	I915_WRITE16(IMR, dev_priv->irq_mask);
 992	POSTING_READ16(RING_IMR(engine->mmio_base));
 993}
 994
 995static void
 996i8xx_irq_disable(struct intel_engine_cs *engine)
 997{
 998	struct drm_i915_private *dev_priv = engine->i915;
 999
1000	dev_priv->irq_mask |= engine->irq_enable_mask;
1001	I915_WRITE16(IMR, dev_priv->irq_mask);
1002}
1003
1004static int
1005bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
1006{
1007	u32 *cs;
1008
1009	cs = intel_ring_begin(req, 2);
1010	if (IS_ERR(cs))
1011		return PTR_ERR(cs);
1012
1013	*cs++ = MI_FLUSH;
1014	*cs++ = MI_NOOP;
1015	intel_ring_advance(req, cs);
1016	return 0;
1017}
1018
1019static void
1020gen6_irq_enable(struct intel_engine_cs *engine)
1021{
1022	struct drm_i915_private *dev_priv = engine->i915;
1023
1024	I915_WRITE_IMR(engine,
1025		       ~(engine->irq_enable_mask |
1026			 engine->irq_keep_mask));
1027	gen5_enable_gt_irq(dev_priv, engine->irq_enable_mask);
1028}
1029
1030static void
1031gen6_irq_disable(struct intel_engine_cs *engine)
1032{
1033	struct drm_i915_private *dev_priv = engine->i915;
1034
1035	I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
1036	gen5_disable_gt_irq(dev_priv, engine->irq_enable_mask);
1037}
1038
1039static void
1040hsw_vebox_irq_enable(struct intel_engine_cs *engine)
1041{
1042	struct drm_i915_private *dev_priv = engine->i915;
1043
1044	I915_WRITE_IMR(engine, ~engine->irq_enable_mask);
1045	gen6_unmask_pm_irq(dev_priv, engine->irq_enable_mask);
1046}
1047
1048static void
1049hsw_vebox_irq_disable(struct intel_engine_cs *engine)
1050{
1051	struct drm_i915_private *dev_priv = engine->i915;
1052
1053	I915_WRITE_IMR(engine, ~0);
1054	gen6_mask_pm_irq(dev_priv, engine->irq_enable_mask);
1055}
1056
1057static void
1058gen8_irq_enable(struct intel_engine_cs *engine)
1059{
1060	struct drm_i915_private *dev_priv = engine->i915;
1061
1062	I915_WRITE_IMR(engine,
1063		       ~(engine->irq_enable_mask |
1064			 engine->irq_keep_mask));
1065	POSTING_READ_FW(RING_IMR(engine->mmio_base));
1066}
1067
1068static void
1069gen8_irq_disable(struct intel_engine_cs *engine)
1070{
1071	struct drm_i915_private *dev_priv = engine->i915;
1072
1073	I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
1074}
1075
1076static int
1077i965_emit_bb_start(struct drm_i915_gem_request *req,
1078		   u64 offset, u32 length,
1079		   unsigned int dispatch_flags)
1080{
1081	u32 *cs;
1082
1083	cs = intel_ring_begin(req, 2);
1084	if (IS_ERR(cs))
1085		return PTR_ERR(cs);
1086
1087	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | (dispatch_flags &
1088		I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE_I965);
1089	*cs++ = offset;
1090	intel_ring_advance(req, cs);
1091
1092	return 0;
1093}
1094
1095/* Just userspace ABI convention to limit the wa batch bo to a resonable size */
1096#define I830_BATCH_LIMIT (256*1024)
1097#define I830_TLB_ENTRIES (2)
1098#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
1099static int
1100i830_emit_bb_start(struct drm_i915_gem_request *req,
1101		   u64 offset, u32 len,
1102		   unsigned int dispatch_flags)
1103{
1104	u32 *cs, cs_offset = i915_ggtt_offset(req->engine->scratch);
1105
1106	cs = intel_ring_begin(req, 6);
1107	if (IS_ERR(cs))
1108		return PTR_ERR(cs);
1109
1110	/* Evict the invalid PTE TLBs */
1111	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
1112	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
1113	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
1114	*cs++ = cs_offset;
1115	*cs++ = 0xdeadbeef;
1116	*cs++ = MI_NOOP;
1117	intel_ring_advance(req, cs);
1118
1119	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
1120		if (len > I830_BATCH_LIMIT)
1121			return -ENOSPC;
1122
1123		cs = intel_ring_begin(req, 6 + 2);
1124		if (IS_ERR(cs))
1125			return PTR_ERR(cs);
1126
1127		/* Blit the batch (which has now all relocs applied) to the
1128		 * stable batch scratch bo area (so that the CS never
1129		 * stumbles over its tlb invalidation bug) ...
1130		 */
1131		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA;
1132		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
1133		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
1134		*cs++ = cs_offset;
1135		*cs++ = 4096;
1136		*cs++ = offset;
1137
1138		*cs++ = MI_FLUSH;
1139		*cs++ = MI_NOOP;
1140		intel_ring_advance(req, cs);
1141
1142		/* ... and execute it. */
1143		offset = cs_offset;
1144	}
1145
1146	cs = intel_ring_begin(req, 2);
1147	if (IS_ERR(cs))
1148		return PTR_ERR(cs);
1149
1150	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1151	*cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1152		MI_BATCH_NON_SECURE);
1153	intel_ring_advance(req, cs);
1154
1155	return 0;
1156}
1157
1158static int
1159i915_emit_bb_start(struct drm_i915_gem_request *req,
1160		   u64 offset, u32 len,
1161		   unsigned int dispatch_flags)
1162{
1163	u32 *cs;
1164
1165	cs = intel_ring_begin(req, 2);
1166	if (IS_ERR(cs))
1167		return PTR_ERR(cs);
1168
1169	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1170	*cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1171		MI_BATCH_NON_SECURE);
1172	intel_ring_advance(req, cs);
1173
1174	return 0;
1175}
1176
1177static void cleanup_phys_status_page(struct intel_engine_cs *engine)
1178{
1179	struct drm_i915_private *dev_priv = engine->i915;
1180
1181	if (!dev_priv->status_page_dmah)
1182		return;
1183
1184	drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah);
1185	engine->status_page.page_addr = NULL;
1186}
1187
1188static void cleanup_status_page(struct intel_engine_cs *engine)
1189{
1190	struct i915_vma *vma;
1191	struct drm_i915_gem_object *obj;
1192
1193	vma = fetch_and_zero(&engine->status_page.vma);
1194	if (!vma)
1195		return;
1196
1197	obj = vma->obj;
1198
1199	i915_vma_unpin(vma);
1200	i915_vma_close(vma);
1201
1202	i915_gem_object_unpin_map(obj);
1203	__i915_gem_object_release_unless_active(obj);
1204}
1205
1206static int init_status_page(struct intel_engine_cs *engine)
1207{
1208	struct drm_i915_gem_object *obj;
1209	struct i915_vma *vma;
1210	unsigned int flags;
1211	void *vaddr;
1212	int ret;
1213
1214	obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
1215	if (IS_ERR(obj)) {
1216		DRM_ERROR("Failed to allocate status page\n");
1217		return PTR_ERR(obj);
1218	}
1219
1220	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
1221	if (ret)
1222		goto err;
1223
1224	vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
1225	if (IS_ERR(vma)) {
1226		ret = PTR_ERR(vma);
1227		goto err;
1228	}
1229
1230	flags = PIN_GLOBAL;
1231	if (!HAS_LLC(engine->i915))
1232		/* On g33, we cannot place HWS above 256MiB, so
1233		 * restrict its pinning to the low mappable arena.
1234		 * Though this restriction is not documented for
1235		 * gen4, gen5, or byt, they also behave similarly
1236		 * and hang if the HWS is placed at the top of the
1237		 * GTT. To generalise, it appears that all !llc
1238		 * platforms have issues with us placing the HWS
1239		 * above the mappable region (even though we never
1240		 * actualy map it).
1241		 */
1242		flags |= PIN_MAPPABLE;
1243	ret = i915_vma_pin(vma, 0, 4096, flags);
1244	if (ret)
1245		goto err;
1246
1247	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
1248	if (IS_ERR(vaddr)) {
1249		ret = PTR_ERR(vaddr);
1250		goto err_unpin;
1251	}
1252
1253	engine->status_page.vma = vma;
1254	engine->status_page.ggtt_offset = i915_ggtt_offset(vma);
1255	engine->status_page.page_addr = memset(vaddr, 0, PAGE_SIZE);
1256
1257	DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n",
1258			 engine->name, i915_ggtt_offset(vma));
1259	return 0;
1260
1261err_unpin:
1262	i915_vma_unpin(vma);
1263err:
1264	i915_gem_object_put(obj);
1265	return ret;
1266}
1267
1268static int init_phys_status_page(struct intel_engine_cs *engine)
1269{
1270	struct drm_i915_private *dev_priv = engine->i915;
1271
1272	GEM_BUG_ON(engine->id != RCS);
1273
1274	dev_priv->status_page_dmah =
1275		drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE);
1276	if (!dev_priv->status_page_dmah)
1277		return -ENOMEM;
1278
1279	engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr;
1280	memset(engine->status_page.page_addr, 0, PAGE_SIZE);
1281
1282	return 0;
1283}
1284
1285int intel_ring_pin(struct intel_ring *ring,
1286		   struct drm_i915_private *i915,
1287		   unsigned int offset_bias)
1288{
1289	enum i915_map_type map = HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC;
1290	struct i915_vma *vma = ring->vma;
1291	unsigned int flags;
1292	void *addr;
1293	int ret;
1294
1295	GEM_BUG_ON(ring->vaddr);
1296
1297
1298	flags = PIN_GLOBAL;
1299	if (offset_bias)
1300		flags |= PIN_OFFSET_BIAS | offset_bias;
1301	if (vma->obj->stolen)
1302		flags |= PIN_MAPPABLE;
1303
1304	if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
1305		if (flags & PIN_MAPPABLE || map == I915_MAP_WC)
1306			ret = i915_gem_object_set_to_gtt_domain(vma->obj, true);
1307		else
1308			ret = i915_gem_object_set_to_cpu_domain(vma->obj, true);
1309		if (unlikely(ret))
1310			return ret;
1311	}
1312
1313	ret = i915_vma_pin(vma, 0, PAGE_SIZE, flags);
1314	if (unlikely(ret))
1315		return ret;
1316
1317	if (i915_vma_is_map_and_fenceable(vma))
1318		addr = (void __force *)i915_vma_pin_iomap(vma);
1319	else
1320		addr = i915_gem_object_pin_map(vma->obj, map);
1321	if (IS_ERR(addr))
1322		goto err;
1323
1324	ring->vaddr = addr;
1325	return 0;
1326
1327err:
1328	i915_vma_unpin(vma);
1329	return PTR_ERR(addr);
1330}
1331
1332void intel_ring_reset(struct intel_ring *ring, u32 tail)
1333{
1334	GEM_BUG_ON(!list_empty(&ring->request_list));
1335	ring->tail = tail;
1336	ring->head = tail;
1337	ring->emit = tail;
1338	intel_ring_update_space(ring);
1339}
1340
1341void intel_ring_unpin(struct intel_ring *ring)
1342{
1343	GEM_BUG_ON(!ring->vma);
1344	GEM_BUG_ON(!ring->vaddr);
1345
1346	/* Discard any unused bytes beyond that submitted to hw. */
1347	intel_ring_reset(ring, ring->tail);
1348
1349	if (i915_vma_is_map_and_fenceable(ring->vma))
1350		i915_vma_unpin_iomap(ring->vma);
1351	else
1352		i915_gem_object_unpin_map(ring->vma->obj);
1353	ring->vaddr = NULL;
1354
1355	i915_vma_unpin(ring->vma);
1356}
1357
1358static struct i915_vma *
1359intel_ring_create_vma(struct drm_i915_private *dev_priv, int size)
1360{
1361	struct drm_i915_gem_object *obj;
1362	struct i915_vma *vma;
1363
1364	obj = i915_gem_object_create_stolen(dev_priv, size);
1365	if (!obj)
1366		obj = i915_gem_object_create_internal(dev_priv, size);
1367	if (IS_ERR(obj))
1368		return ERR_CAST(obj);
1369
1370	/* mark ring buffers as read-only from GPU side by default */
1371	obj->gt_ro = 1;
1372
1373	vma = i915_vma_instance(obj, &dev_priv->ggtt.base, NULL);
1374	if (IS_ERR(vma))
1375		goto err;
1376
1377	return vma;
1378
1379err:
1380	i915_gem_object_put(obj);
1381	return vma;
1382}
1383
1384struct intel_ring *
1385intel_engine_create_ring(struct intel_engine_cs *engine, int size)
1386{
1387	struct intel_ring *ring;
1388	struct i915_vma *vma;
1389
1390	GEM_BUG_ON(!is_power_of_2(size));
1391	GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES);
1392
1393	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
1394	if (!ring)
1395		return ERR_PTR(-ENOMEM);
1396
1397	INIT_LIST_HEAD(&ring->request_list);
1398
1399	ring->size = size;
1400	/* Workaround an erratum on the i830 which causes a hang if
1401	 * the TAIL pointer points to within the last 2 cachelines
1402	 * of the buffer.
1403	 */
1404	ring->effective_size = size;
1405	if (IS_I830(engine->i915) || IS_I845G(engine->i915))
1406		ring->effective_size -= 2 * CACHELINE_BYTES;
1407
1408	intel_ring_update_space(ring);
1409
1410	vma = intel_ring_create_vma(engine->i915, size);
1411	if (IS_ERR(vma)) {
1412		kfree(ring);
1413		return ERR_CAST(vma);
1414	}
1415	ring->vma = vma;
1416
1417	return ring;
1418}
1419
1420void
1421intel_ring_free(struct intel_ring *ring)
1422{
1423	struct drm_i915_gem_object *obj = ring->vma->obj;
1424
1425	i915_vma_close(ring->vma);
1426	__i915_gem_object_release_unless_active(obj);
1427
1428	kfree(ring);
1429}
1430
1431static int context_pin(struct i915_gem_context *ctx)
1432{
1433	struct i915_vma *vma = ctx->engine[RCS].state;
1434	int ret;
1435
1436	/* Clear this page out of any CPU caches for coherent swap-in/out.
1437	 * We only want to do this on the first bind so that we do not stall
1438	 * on an active context (which by nature is already on the GPU).
1439	 */
1440	if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
1441		ret = i915_gem_object_set_to_gtt_domain(vma->obj, false);
1442		if (ret)
1443			return ret;
1444	}
1445
1446	return i915_vma_pin(vma, 0, I915_GTT_MIN_ALIGNMENT,
1447			    PIN_GLOBAL | PIN_HIGH);
1448}
1449
1450static struct i915_vma *
1451alloc_context_vma(struct intel_engine_cs *engine)
1452{
1453	struct drm_i915_private *i915 = engine->i915;
1454	struct drm_i915_gem_object *obj;
1455	struct i915_vma *vma;
1456
1457	obj = i915_gem_object_create(i915, engine->context_size);
1458	if (IS_ERR(obj))
1459		return ERR_CAST(obj);
1460
1461	/*
1462	 * Try to make the context utilize L3 as well as LLC.
1463	 *
1464	 * On VLV we don't have L3 controls in the PTEs so we
1465	 * shouldn't touch the cache level, especially as that
1466	 * would make the object snooped which might have a
1467	 * negative performance impact.
1468	 *
1469	 * Snooping is required on non-llc platforms in execlist
1470	 * mode, but since all GGTT accesses use PAT entry 0 we
1471	 * get snooping anyway regardless of cache_level.
1472	 *
1473	 * This is only applicable for Ivy Bridge devices since
1474	 * later platforms don't have L3 control bits in the PTE.
1475	 */
1476	if (IS_IVYBRIDGE(i915)) {
1477		/* Ignore any error, regard it as a simple optimisation */
1478		i915_gem_object_set_cache_level(obj, I915_CACHE_L3_LLC);
1479	}
1480
1481	vma = i915_vma_instance(obj, &i915->ggtt.base, NULL);
1482	if (IS_ERR(vma))
1483		i915_gem_object_put(obj);
1484
1485	return vma;
1486}
1487
1488static struct intel_ring *
1489intel_ring_context_pin(struct intel_engine_cs *engine,
1490		       struct i915_gem_context *ctx)
1491{
1492	struct intel_context *ce = &ctx->engine[engine->id];
1493	int ret;
1494
1495	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
1496
1497	if (likely(ce->pin_count++))
1498		goto out;
1499	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
1500
1501	if (!ce->state && engine->context_size) {
1502		struct i915_vma *vma;
1503
1504		vma = alloc_context_vma(engine);
1505		if (IS_ERR(vma)) {
1506			ret = PTR_ERR(vma);
1507			goto err;
1508		}
1509
1510		ce->state = vma;
1511	}
1512
1513	if (ce->state) {
1514		ret = context_pin(ctx);
1515		if (ret)
1516			goto err;
1517
1518		ce->state->obj->mm.dirty = true;
1519	}
1520
1521	/* The kernel context is only used as a placeholder for flushing the
1522	 * active context. It is never used for submitting user rendering and
1523	 * as such never requires the golden render context, and so we can skip
1524	 * emitting it when we switch to the kernel context. This is required
1525	 * as during eviction we cannot allocate and pin the renderstate in
1526	 * order to initialise the context.
1527	 */
1528	if (i915_gem_context_is_kernel(ctx))
1529		ce->initialised = true;
1530
1531	i915_gem_context_get(ctx);
1532
1533out:
1534	/* One ringbuffer to rule them all */
1535	return engine->buffer;
1536
1537err:
1538	ce->pin_count = 0;
1539	return ERR_PTR(ret);
1540}
1541
1542static void intel_ring_context_unpin(struct intel_engine_cs *engine,
1543				     struct i915_gem_context *ctx)
1544{
1545	struct intel_context *ce = &ctx->engine[engine->id];
1546
1547	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
1548	GEM_BUG_ON(ce->pin_count == 0);
1549
1550	if (--ce->pin_count)
1551		return;
1552
1553	if (ce->state)
1554		i915_vma_unpin(ce->state);
1555
1556	i915_gem_context_put(ctx);
1557}
1558
1559static int intel_init_ring_buffer(struct intel_engine_cs *engine)
1560{
1561	struct intel_ring *ring;
1562	int err;
1563
1564	intel_engine_setup_common(engine);
1565
1566	err = intel_engine_init_common(engine);
1567	if (err)
1568		goto err;
1569
1570	if (HWS_NEEDS_PHYSICAL(engine->i915))
1571		err = init_phys_status_page(engine);
1572	else
1573		err = init_status_page(engine);
1574	if (err)
1575		goto err;
1576
1577	ring = intel_engine_create_ring(engine, 32 * PAGE_SIZE);
1578	if (IS_ERR(ring)) {
1579		err = PTR_ERR(ring);
1580		goto err_hws;
1581	}
1582
1583	/* Ring wraparound at offset 0 sometimes hangs. No idea why. */
1584	err = intel_ring_pin(ring, engine->i915, I915_GTT_PAGE_SIZE);
1585	if (err)
1586		goto err_ring;
1587
1588	GEM_BUG_ON(engine->buffer);
1589	engine->buffer = ring;
1590
1591	return 0;
1592
1593err_ring:
1594	intel_ring_free(ring);
1595err_hws:
1596	if (HWS_NEEDS_PHYSICAL(engine->i915))
1597		cleanup_phys_status_page(engine);
1598	else
1599		cleanup_status_page(engine);
1600err:
1601	intel_engine_cleanup_common(engine);
1602	return err;
1603}
1604
1605void intel_engine_cleanup(struct intel_engine_cs *engine)
1606{
1607	struct drm_i915_private *dev_priv = engine->i915;
1608
1609	WARN_ON(INTEL_GEN(dev_priv) > 2 &&
1610		(I915_READ_MODE(engine) & MODE_IDLE) == 0);
1611
1612	intel_ring_unpin(engine->buffer);
1613	intel_ring_free(engine->buffer);
1614
1615	if (engine->cleanup)
1616		engine->cleanup(engine);
1617
1618	if (HWS_NEEDS_PHYSICAL(dev_priv))
1619		cleanup_phys_status_page(engine);
1620	else
1621		cleanup_status_page(engine);
1622
1623	intel_engine_cleanup_common(engine);
1624
1625	dev_priv->engine[engine->id] = NULL;
1626	kfree(engine);
1627}
1628
1629void intel_legacy_submission_resume(struct drm_i915_private *dev_priv)
1630{
1631	struct intel_engine_cs *engine;
1632	enum intel_engine_id id;
1633
1634	/* Restart from the beginning of the rings for convenience */
1635	for_each_engine(engine, dev_priv, id)
1636		intel_ring_reset(engine->buffer, 0);
1637}
1638
1639static int ring_request_alloc(struct drm_i915_gem_request *request)
1640{
1641	u32 *cs;
1642
1643	GEM_BUG_ON(!request->ctx->engine[request->engine->id].pin_count);
1644
1645	/* Flush enough space to reduce the likelihood of waiting after
1646	 * we start building the request - in which case we will just
1647	 * have to repeat work.
1648	 */
1649	request->reserved_space += LEGACY_REQUEST_SIZE;
1650
1651	cs = intel_ring_begin(request, 0);
1652	if (IS_ERR(cs))
1653		return PTR_ERR(cs);
1654
1655	request->reserved_space -= LEGACY_REQUEST_SIZE;
1656	return 0;
1657}
1658
/*
 * wait_for_space - wait until the request's ring has @bytes free
 * @req: request being built; the wait is on req->ring
 * @bytes: number of bytes that must be available
 *
 * Returns immediately if intel_ring_update_space() already reports enough
 * room. Otherwise picks the first request on the ring's request_list whose
 * completion would leave at least @bytes free, waits for it
 * (interruptibly, with no timeout limit), retires up to and including it,
 * and recomputes the ring space.
 *
 * Caller must hold struct_mutex (asserted via lockdep).
 *
 * Returns 0 on success, -ENOSPC (with a WARN) if no request on the list
 * would free enough space, or the negative value returned by
 * i915_wait_request().
 */
static noinline int wait_for_space(struct drm_i915_gem_request *req,
				   unsigned int bytes)
{
	struct intel_ring *ring = req->ring;
	struct drm_i915_gem_request *target;
	long timeout;

	lockdep_assert_held(&req->i915->drm.struct_mutex);

	if (intel_ring_update_space(ring) >= bytes)
		return 0;

	/*
	 * Space is reserved in the ringbuffer for finalising the request,
	 * as that cannot be allowed to fail. During request finalisation,
	 * reserved_space is set to 0 to stop the overallocation and the
	 * assumption is that then we never need to wait (which has the
	 * risk of failing with EINTR).
	 *
	 * See also i915_gem_request_alloc() and i915_add_request().
	 */
	GEM_BUG_ON(!req->reserved_space);

	list_for_each_entry(target, &ring->request_list, ring_link) {
		/* Would completion of this request free enough space? */
		if (bytes <= __intel_ring_space(target->postfix,
						ring->emit, ring->size))
			break;
	}

	/* The loop ran off the end of the list: nothing frees enough space. */
	if (WARN_ON(&target->ring_link == &ring->request_list))
		return -ENOSPC;

	timeout = i915_wait_request(target,
				    I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
				    MAX_SCHEDULE_TIMEOUT);
	if (timeout < 0)
		return timeout;

	i915_gem_request_retire_upto(target);

	intel_ring_update_space(ring);
	GEM_BUG_ON(ring->space < bytes);
	return 0;
}
1704
/*
 * intel_ring_begin - reserve space in the request's ring for new commands
 * @req: request being built; commands go into req->ring
 * @num_dwords: number of dwords to reserve; must be even (asserted)
 *
 * Ensures that @num_dwords plus the request's reserved_space fit in the
 * ring, waiting for space via wait_for_space() if necessary. If the
 * dwords themselves do not fit before the end of the usable ring, the
 * remainder of the ring is zero-filled and emission wraps to offset 0.
 *
 * On success ring->emit is advanced and ring->space reduced by the
 * reserved bytes, and a pointer to the start of the reserved region is
 * returned. On failure the error from wait_for_space() is returned as an
 * ERR_PTR.
 */
u32 *intel_ring_begin(struct drm_i915_gem_request *req,
		      unsigned int num_dwords)
{
	struct intel_ring *ring = req->ring;
	const unsigned int remain_usable = ring->effective_size - ring->emit;
	const unsigned int bytes = num_dwords * sizeof(u32);
	unsigned int need_wrap = 0;
	unsigned int total_bytes;
	u32 *cs;

	/* Packets must be qword aligned. */
	GEM_BUG_ON(num_dwords & 1);

	total_bytes = bytes + req->reserved_space;
	GEM_BUG_ON(total_bytes > ring->effective_size);

	if (unlikely(total_bytes > remain_usable)) {
		const int remain_actual = ring->size - ring->emit;

		if (bytes > remain_usable) {
			/*
			 * Not enough space for the basic request. So need to
			 * flush out the remainder and then wait for
			 * base + reserved.
			 */
			total_bytes += remain_actual;
			/*
			 * Bit 0 is set so that need_wrap is non-zero whatever
			 * the value of remain_actual; it is masked off again
			 * below before need_wrap is used as a byte count.
			 */
			need_wrap = remain_actual | 1;
		} else  {
			/*
			 * The base request will fit but the reserved space
			 * falls off the end. So we don't need an immediate
			 * wrap and only need to effectively wait for the
			 * reserved size from the start of ringbuffer.
			 */
			total_bytes = req->reserved_space + remain_actual;
		}
	}

	if (unlikely(total_bytes > ring->space)) {
		int ret = wait_for_space(req, total_bytes);
		if (unlikely(ret))
			return ERR_PTR(ret);
	}

	if (unlikely(need_wrap)) {
		/* Drop the flag bit to recover the number of bytes to fill. */
		need_wrap &= ~1;
		GEM_BUG_ON(need_wrap > ring->space);
		GEM_BUG_ON(ring->emit + need_wrap > ring->size);

		/* Fill the tail with MI_NOOP */
		memset(ring->vaddr + ring->emit, 0, need_wrap);
		ring->emit = 0;
		ring->space -= need_wrap;
	}

	GEM_BUG_ON(ring->emit > ring->size - bytes);
	GEM_BUG_ON(ring->space < bytes);
	cs = ring->vaddr + ring->emit;
	GEM_DEBUG_EXEC(memset(cs, POISON_INUSE, bytes));
	ring->emit += bytes;
	ring->space -= bytes;

	return cs;
}
1769
1770/* Align the ring tail to a cacheline boundary */
1771int intel_ring_cacheline_align(struct drm_i915_gem_request *req)
1772{
1773	int num_dwords =
1774		(req->ring->emit & (CACHELINE_BYTES - 1)) / sizeof(uint32_t);
1775	u32 *cs;
1776
1777	if (num_dwords == 0)
1778		return 0;
1779
1780	num_dwords = CACHELINE_BYTES / sizeof(uint32_t) - num_dwords;
1781	cs = intel_ring_begin(req, num_dwords);
1782	if (IS_ERR(cs))
1783		return PTR_ERR(cs);
1784
1785	while (num_dwords--)
1786		*cs++ = MI_NOOP;
1787
1788	intel_ring_advance(req, cs);
1789
1790	return 0;
1791}
1792
/*
 * Submit a request on the gen6 BSD ring.
 *
 * Wraps i9xx_submit_request() in the register sequence below, all under a
 * FORCEWAKE_ALL reference: disable the ring's IDLE messages, clear
 * GEN6_BSD_RNCID, wait (up to the timeout passed to
 * __intel_wait_for_register_fw()) for the sleep indicator to clear, update
 * the tail, then re-enable IDLE messages. A wait timeout is logged but the
 * submission still proceeds.
 */
static void gen6_bsd_submit_request(struct drm_i915_gem_request *request)
{
	struct drm_i915_private *dev_priv = request->i915;

	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);

	/* Every tail move must follow the sequence below */

	/* Disable notification that the ring is IDLE. The GT
	 * will then assume that it is busy and bring it out of rc6.
	 */
	I915_WRITE_FW(GEN6_BSD_SLEEP_PSMI_CONTROL,
		      _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));

	/* Clear the context id. Here be magic! */
	I915_WRITE64_FW(GEN6_BSD_RNCID, 0x0);

	/* Wait for the ring not to be idle, i.e. for it to wake up. */
	if (__intel_wait_for_register_fw(dev_priv,
					 GEN6_BSD_SLEEP_PSMI_CONTROL,
					 GEN6_BSD_SLEEP_INDICATOR,
					 0,
					 1000, 0, NULL))
		DRM_ERROR("timed out waiting for the BSD ring to wake up\n");

	/* Now that the ring is fully powered up, update the tail */
	i9xx_submit_request(request);

	/* Let the ring send IDLE messages to the GT again,
	 * and so let it sleep to conserve power when idle.
	 */
	I915_WRITE_FW(GEN6_BSD_SLEEP_PSMI_CONTROL,
		      _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));

	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
}
1829
1830static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode)
1831{
1832	u32 cmd, *cs;
1833
1834	cs = intel_ring_begin(req, 4);
1835	if (IS_ERR(cs))
1836		return PTR_ERR(cs);
1837
1838	cmd = MI_FLUSH_DW;
1839	if (INTEL_GEN(req->i915) >= 8)
1840		cmd += 1;
1841
1842	/* We always require a command barrier so that subsequent
1843	 * commands, such as breadcrumb interrupts, are strictly ordered
1844	 * wrt the contents of the write cache being flushed to memory
1845	 * (and thus being coherent from the CPU).
1846	 */
1847	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
1848
1849	/*
1850	 * Bspec vol 1c.5 - video engine command streamer:
1851	 * "If ENABLED, all TLBs will be invalidated once the flush
1852	 * operation is complete. This bit is only valid when the
1853	 * Post-Sync Operation field is a value of 1h or 3h."
1854	 */
1855	if (mode & EMIT_INVALIDATE)
1856		cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;
1857
1858	*cs++ = cmd;
1859	*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
1860	if (INTEL_GEN(req->i915) >= 8) {
1861		*cs++ = 0; /* upper addr */
1862		*cs++ = 0; /* value */
1863	} else  {
1864		*cs++ = 0;
1865		*cs++ = MI_NOOP;
1866	}
1867	intel_ring_advance(req, cs);
1868	return 0;
1869}
1870
1871static int
1872gen8_emit_bb_start(struct drm_i915_gem_request *req,
1873		   u64 offset, u32 len,
1874		   unsigned int dispatch_flags)
1875{
1876	bool ppgtt = USES_PPGTT(req->i915) &&
1877			!(dispatch_flags & I915_DISPATCH_SECURE);
1878	u32 *cs;
1879
1880	cs = intel_ring_begin(req, 4);
1881	if (IS_ERR(cs))
1882		return PTR_ERR(cs);
1883
1884	/* FIXME(BDW): Address space and security selectors. */
1885	*cs++ = MI_BATCH_BUFFER_START_GEN8 | (ppgtt << 8) | (dispatch_flags &
1886		I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0);
1887	*cs++ = lower_32_bits(offset);
1888	*cs++ = upper_32_bits(offset);
1889	*cs++ = MI_NOOP;
1890	intel_ring_advance(req, cs);
1891
1892	return 0;
1893}
1894
1895static int
1896hsw_emit_bb_start(struct drm_i915_gem_request *req,
1897		  u64 offset, u32 len,
1898		  unsigned int dispatch_flags)
1899{
1900	u32 *cs;
1901
1902	cs = intel_ring_begin(req, 2);
1903	if (IS_ERR(cs))
1904		return PTR_ERR(cs);
1905
1906	*cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
1907		0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) |
1908		(dispatch_flags & I915_DISPATCH_RS ?
1909		MI_BATCH_RESOURCE_STREAMER : 0);
1910	/* bit0-7 is the length on GEN6+ */
1911	*cs++ = offset;
1912	intel_ring_advance(req, cs);
1913
1914	return 0;
1915}
1916
1917static int
1918gen6_emit_bb_start(struct drm_i915_gem_request *req,
1919		   u64 offset, u32 len,
1920		   unsigned int dispatch_flags)
1921{
1922	u32 *cs;
1923
1924	cs = intel_ring_begin(req, 2);
1925	if (IS_ERR(cs))
1926		return PTR_ERR(cs);
1927
1928	*cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
1929		0 : MI_BATCH_NON_SECURE_I965);
1930	/* bit0-7 is the length on GEN6+ */
1931	*cs++ = offset;
1932	intel_ring_advance(req, cs);
1933
1934	return 0;
1935}
1936
1937/* Blitter support (SandyBridge+) */
1938
1939static int gen6_ring_flush(struct drm_i915_gem_request *req, u32 mode)
1940{
1941	u32 cmd, *cs;
1942
1943	cs = intel_ring_begin(req, 4);
1944	if (IS_ERR(cs))
1945		return PTR_ERR(cs);
1946
1947	cmd = MI_FLUSH_DW;
1948	if (INTEL_GEN(req->i915) >= 8)
1949		cmd += 1;
1950
1951	/* We always require a command barrier so that subsequent
1952	 * commands, such as breadcrumb interrupts, are strictly ordered
1953	 * wrt the contents of the write cache being flushed to memory
1954	 * (and thus being coherent from the CPU).
1955	 */
1956	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
1957
1958	/*
1959	 * Bspec vol 1c.3 - blitter engine command streamer:
1960	 * "If ENABLED, all TLBs will be invalidated once the flush
1961	 * operation is complete. This bit is only valid when the
1962	 * Post-Sync Operation field is a value of 1h or 3h."
1963	 */
1964	if (mode & EMIT_INVALIDATE)
1965		cmd |= MI_INVALIDATE_TLB;
1966	*cs++ = cmd;
1967	*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
1968	if (INTEL_GEN(req->i915) >= 8) {
1969		*cs++ = 0; /* upper addr */
1970		*cs++ = 0; /* value */
1971	} else  {
1972		*cs++ = 0;
1973		*cs++ = MI_NOOP;
1974	}
1975	intel_ring_advance(req, cs);
1976
1977	return 0;
1978}
1979
/*
 * Set up inter-engine semaphore state for @engine.
 *
 * Does nothing when the i915.semaphores module option is off.
 *
 * gen8+: on first use, allocates and pins (high in the global GTT) a
 * one-page object shared through dev_priv->semaphore, then fills
 * engine->semaphore.signal_ggtt[] with one GGTT offset per other engine
 * (MI_SEMAPHORE_SYNC_INVALID for the engine itself).
 *
 * gen6/gen7: fills the engine's mailbox wait values and signal registers
 * from the static sem_data table, indexed by hardware engine id.
 *
 * If the gen8+ allocation or pinning fails, semaphores are disabled
 * globally by clearing i915.semaphores.
 */
static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv,
				       struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	int ret, i;

	if (!i915.semaphores)
		return;

	if (INTEL_GEN(dev_priv) >= 8 && !dev_priv->semaphore) {
		struct i915_vma *vma;

		obj = i915_gem_object_create(dev_priv, PAGE_SIZE);
		if (IS_ERR(obj))
			goto err;

		vma = i915_vma_instance(obj, &dev_priv->ggtt.base, NULL);
		if (IS_ERR(vma))
			goto err_obj;

		ret = i915_gem_object_set_to_gtt_domain(obj, false);
		if (ret)
			goto err_obj;

		ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
		if (ret)
			goto err_obj;

		dev_priv->semaphore = vma;
	}

	if (INTEL_GEN(dev_priv) >= 8) {
		u32 offset = i915_ggtt_offset(dev_priv->semaphore);

		engine->semaphore.sync_to = gen8_ring_sync_to;
		engine->semaphore.signal = gen8_xcs_signal;

		for (i = 0; i < I915_NUM_ENGINES; i++) {
			u32 ring_offset;

			/* An engine never signals itself. */
			if (i != engine->id)
				ring_offset = offset + GEN8_SEMAPHORE_OFFSET(engine->id, i);
			else
				ring_offset = MI_SEMAPHORE_SYNC_INVALID;

			engine->semaphore.signal_ggtt[i] = ring_offset;
		}
	} else if (INTEL_GEN(dev_priv) >= 6) {
		engine->semaphore.sync_to = gen6_ring_sync_to;
		engine->semaphore.signal = gen6_signal;

		/*
		 * The current semaphore is only applied on pre-gen8
		 * platform.  And there is no VCS2 ring on the pre-gen8
		 * platform. So the semaphore between RCS and VCS2 is
		 * initialized as INVALID.  Gen8 will initialize the
		 * sema between VCS2 and RCS later.
		 */
		for (i = 0; i < GEN6_NUM_SEMAPHORES; i++) {
			/* Indexed as sem_data[this engine's hw_id][other hw_id]. */
			static const struct {
				u32 wait_mbox;
				i915_reg_t mbox_reg;
			} sem_data[GEN6_NUM_SEMAPHORES][GEN6_NUM_SEMAPHORES] = {
				[RCS_HW] = {
					[VCS_HW] =  { .wait_mbox = MI_SEMAPHORE_SYNC_RV,  .mbox_reg = GEN6_VRSYNC },
					[BCS_HW] =  { .wait_mbox = MI_SEMAPHORE_SYNC_RB,  .mbox_reg = GEN6_BRSYNC },
					[VECS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_RVE, .mbox_reg = GEN6_VERSYNC },
				},
				[VCS_HW] = {
					[RCS_HW] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VR,  .mbox_reg = GEN6_RVSYNC },
					[BCS_HW] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VB,  .mbox_reg = GEN6_BVSYNC },
					[VECS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VVE, .mbox_reg = GEN6_VEVSYNC },
				},
				[BCS_HW] = {
					[RCS_HW] =  { .wait_mbox = MI_SEMAPHORE_SYNC_BR,  .mbox_reg = GEN6_RBSYNC },
					[VCS_HW] =  { .wait_mbox = MI_SEMAPHORE_SYNC_BV,  .mbox_reg = GEN6_VBSYNC },
					[VECS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_BVE, .mbox_reg = GEN6_VEBSYNC },
				},
				[VECS_HW] = {
					[RCS_HW] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VER, .mbox_reg = GEN6_RVESYNC },
					[VCS_HW] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VEV, .mbox_reg = GEN6_VVESYNC },
					[BCS_HW] =  { .wait_mbox = MI_SEMAPHORE_SYNC_VEB, .mbox_reg = GEN6_BVESYNC },
				},
			};
			u32 wait_mbox;
			i915_reg_t mbox_reg;

			/* The engine's own slot is marked invalid / no-sync. */
			if (i == engine->hw_id) {
				wait_mbox = MI_SEMAPHORE_SYNC_INVALID;
				mbox_reg = GEN6_NOSYNC;
			} else {
				wait_mbox = sem_data[engine->hw_id][i].wait_mbox;
				mbox_reg = sem_data[engine->hw_id][i].mbox_reg;
			}

			engine->semaphore.mbox.wait[i] = wait_mbox;
			engine->semaphore.mbox.signal[i] = mbox_reg;
		}
	}

	return;

err_obj:
	i915_gem_object_put(obj);
err:
	DRM_DEBUG_DRIVER("Failed to allocate space for semaphores, disabling\n");
	i915.semaphores = 0;
}
2088
2089static void intel_ring_init_irq(struct drm_i915_private *dev_priv,
2090				struct intel_engine_cs *engine)
2091{
2092	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << engine->irq_shift;
2093
2094	if (INTEL_GEN(dev_priv) >= 8) {
2095		engine->irq_enable = gen8_irq_enable;
2096		engine->irq_disable = gen8_irq_disable;
2097		engine->irq_seqno_barrier = gen6_seqno_barrier;
2098	} else if (INTEL_GEN(dev_priv) >= 6) {
2099		engine->irq_enable = gen6_irq_enable;
2100		engine->irq_disable = gen6_irq_disable;
2101		engine->irq_seqno_barrier = gen6_seqno_barrier;
2102	} else if (INTEL_GEN(dev_priv) >= 5) {
2103		engine->irq_enable = gen5_irq_enable;
2104		engine->irq_disable = gen5_irq_disable;
2105		engine->irq_seqno_barrier = gen5_seqno_barrier;
2106	} else if (INTEL_GEN(dev_priv) >= 3) {
2107		engine->irq_enable = i9xx_irq_enable;
2108		engine->irq_disable = i9xx_irq_disable;
2109	} else {
2110		engine->irq_enable = i8xx_irq_enable;
2111		engine->irq_disable = i8xx_irq_disable;
2112	}
2113}
2114
2115static void i9xx_set_default_submission(struct intel_engine_cs *engine)
2116{
2117	engine->submit_request = i9xx_submit_request;
2118}
2119
2120static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine)
2121{
2122	engine->submit_request = gen6_bsd_submit_request;
2123}
2124
2125static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
2126				      struct intel_engine_cs *engine)
2127{
2128	intel_ring_init_irq(dev_priv, engine);
2129	intel_ring_init_semaphores(dev_priv, engine);
2130
2131	engine->init_hw = init_ring_common;
2132	engine->reset_hw = reset_ring_common;
2133
2134	engine->context_pin = intel_ring_context_pin;
2135	engine->context_unpin = intel_ring_context_unpin;
2136
2137	engine->request_alloc = ring_request_alloc;
2138
2139	engine->emit_breadcrumb = i9xx_emit_breadcrumb;
2140	engine->emit_breadcrumb_sz = i9xx_emit_breadcrumb_sz;
2141	if (i915.semaphores) {
2142		int num_rings;
2143
2144		engine->emit_breadcrumb = gen6_sema_emit_breadcrumb;
2145
2146		num_rings = INTEL_INFO(dev_priv)->num_rings - 1;
2147		if (INTEL_GEN(dev_priv) >= 8) {
2148			engine->emit_breadcrumb_sz += num_rings * 6;
2149		} else {
2150			engine->emit_breadcrumb_sz += num_rings * 3;
2151			if (num_rings & 1)
2152				engine->emit_breadcrumb_sz++;
2153		}
2154	}
2155
2156	engine->set_default_submission = i9xx_set_default_submission;
2157
2158	if (INTEL_GEN(dev_priv) >= 8)
2159		engine->emit_bb_start = gen8_emit_bb_start;
2160	else if (INTEL_GEN(dev_priv) >= 6)
2161		engine->emit_bb_start = gen6_emit_bb_start;
2162	else if (INTEL_GEN(dev_priv) >= 4)
2163		engine->emit_bb_start = i965_emit_bb_start;
2164	else if (IS_I830(dev_priv) || IS_I845G(dev_priv))
2165		engine->emit_bb_start = i830_emit_bb_start;
2166	else
2167		engine->emit_bb_start = i915_emit_bb_start;
2168}
2169
2170int intel_init_render_ring_buffer(struct intel_engine_cs *engine)
2171{
2172	struct drm_i915_private *dev_priv = engine->i915;
2173	int ret;
2174
2175	intel_ring_default_vfuncs(dev_priv, engine);
2176
2177	if (HAS_L3_DPF(dev_priv))
2178		engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
2179
2180	if (INTEL_GEN(dev_priv) >= 8) {
2181		engine->init_context = intel_rcs_ctx_init;
2182		engine->emit_breadcrumb = gen8_render_emit_breadcrumb;
2183		engine->emit_breadcrumb_sz = gen8_render_emit_breadcrumb_sz;
2184		engine->emit_flush = gen8_render_ring_flush;
2185		if (i915.semaphores) {
2186			int num_rings;
2187
2188			engine->semaphore.signal = gen8_rcs_signal;
2189
2190			num_rings = INTEL_INFO(dev_priv)->num_rings - 1;
2191			engine->emit_breadcrumb_sz += num_rings * 8;
2192		}
2193	} else if (INTEL_GEN(dev_priv) >= 6) {
2194		engine->init_context = intel_rcs_ctx_init;
2195		engine->emit_flush = gen7_render_ring_flush;
2196		if (IS_GEN6(dev_priv))
2197			engine->emit_flush = gen6_render_ring_flush;
2198	} else if (IS_GEN5(dev_priv)) {
2199		engine->emit_flush = gen4_render_ring_flush;
2200	} else {
2201		if (INTEL_GEN(dev_priv) < 4)
2202			engine->emit_flush = gen2_render_ring_flush;
2203		else
2204			engine->emit_flush = gen4_render_ring_flush;
2205		engine->irq_enable_mask = I915_USER_INTERRUPT;
2206	}
2207
2208	if (IS_HASWELL(dev_priv))
2209		engine->emit_bb_start = hsw_emit_bb_start;
2210
2211	engine->init_hw = init_render_ring;
2212	engine->cleanup = render_ring_cleanup;
2213
2214	ret = intel_init_ring_buffer(engine);
2215	if (ret)
2216		return ret;
2217
2218	if (INTEL_GEN(dev_priv) >= 6) {
2219		ret = intel_engine_create_scratch(engine, PAGE_SIZE);
2220		if (ret)
2221			return ret;
2222	} else if (HAS_BROKEN_CS_TLB(dev_priv)) {
2223		ret = intel_engine_create_scratch(engine, I830_WA_SIZE);
2224		if (ret)
2225			return ret;
2226	}
2227
2228	return 0;
2229}
2230
2231int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
2232{
2233	struct drm_i915_private *dev_priv = engine->i915;
2234
2235	intel_ring_default_vfuncs(dev_priv, engine);
2236
2237	if (INTEL_GEN(dev_priv) >= 6) {
2238		/* gen6 bsd needs a special wa for tail updates */
2239		if (IS_GEN6(dev_priv))
2240			engine->set_default_submission = gen6_bsd_set_default_submission;
2241		engine->emit_flush = gen6_bsd_ring_flush;
2242		if (INTEL_GEN(dev_priv) < 8)
2243			engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
2244	} else {
2245		engine->mmio_base = BSD_RING_BASE;
2246		engine->emit_flush = bsd_ring_flush;
2247		if (IS_GEN5(dev_priv))
2248			engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
2249		else
2250			engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
2251	}
2252
2253	return intel_init_ring_buffer(engine);
2254}
2255
2256int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
2257{
2258	struct drm_i915_private *dev_priv = engine->i915;
2259
2260	intel_ring_default_vfuncs(dev_priv, engine);
2261
2262	engine->emit_flush = gen6_ring_flush;
2263	if (INTEL_GEN(dev_priv) < 8)
2264		engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
2265
2266	return intel_init_ring_buffer(engine);
2267}
2268
2269int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
2270{
2271	struct drm_i915_private *dev_priv = engine->i915;
2272
2273	intel_ring_default_vfuncs(dev_priv, engine);
2274
2275	engine->emit_flush = gen6_ring_flush;
2276
2277	if (INTEL_GEN(dev_priv) < 8) {
2278		engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
2279		engine->irq_enable = hsw_vebox_irq_enable;
2280		engine->irq_disable = hsw_vebox_irq_disable;
2281	}
2282
2283	return intel_init_ring_buffer(engine);
2284}