Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.11 (2811 lines, 76 kB)
/*
 * Copyright © 2008-2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Zou Nan hai <nanhai.zou@intel.com>
 *    Xiang Hai hao <haihao.xiang@intel.com>
 *
 */

#include <linux/log2.h>
#include <drm/drmP.h>
#include "i915_drv.h"
#include <drm/i915_drm.h>
#include "i915_trace.h"
#include "intel_drv.h"

/* Rough estimate of the typical request size, performing a flush,
 * set-context and then emitting the batch.
 */
#define LEGACY_REQUEST_SIZE 200

/* Bytes available in the ring between the software tail and the hardware
 * head, wrapping around the ring size; I915_RING_FREE_SPACE is kept in
 * reserve so the ring is never reported completely full.  For example,
 * with size = 4096, head = 512 and tail = 3584, head - tail = -3072,
 * which wraps to 1024 bytes before the reserve is subtracted.
 */
int __intel_ring_space(int head, int tail, int size)
{
	int space = head - tail;
	if (space <= 0)
		space += size;
	return space - I915_RING_FREE_SPACE;
}

void intel_ring_update_space(struct intel_ring *ring)
{
	if (ring->last_retired_head != -1) {
		ring->head = ring->last_retired_head;
		ring->last_retired_head = -1;
	}

	ring->space = __intel_ring_space(ring->head & HEAD_ADDR,
					 ring->tail, ring->size);
}

static int
gen2_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
{
	struct intel_ring *ring = req->ring;
	u32 cmd;
	int ret;

	cmd = MI_FLUSH;

	if (mode & EMIT_INVALIDATE)
		cmd |= MI_READ_FLUSH;

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(ring, cmd);
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);

	return 0;
}

static int
gen4_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
{
	struct intel_ring *ring = req->ring;
	u32 cmd;
	int ret;

	/*
	 * read/write caches:
	 *
	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
	 * also flushed at 2d versus 3d pipeline switches.
	 *
	 * read-only caches:
	 *
	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
	 * MI_READ_FLUSH is set, and is always flushed on 965.
	 *
	 * I915_GEM_DOMAIN_COMMAND may not exist?
	 *
	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
	 * invalidated when MI_EXE_FLUSH is set.
	 *
	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
	 * invalidated with every MI_FLUSH.
	 *
	 * TLBs:
	 *
	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
	 * are flushed at any MI_FLUSH.
	 */

	cmd = MI_FLUSH;
	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_EXE_FLUSH;
		if (IS_G4X(req->i915) || IS_GEN5(req->i915))
			cmd |= MI_INVALIDATE_ISP;
	}

	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	intel_ring_emit(ring, cmd);
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);

	return 0;
}

/**
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req)
{
	struct intel_ring *ring = req->ring;
	u32 scratch_addr =
		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
	int ret;

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
	intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
			PIPE_CONTROL_STALL_AT_SCOREBOARD);
	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
	intel_ring_emit(ring, 0); /* low dword */
	intel_ring_emit(ring, 0); /* high dword */
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
	intel_ring_emit(ring, PIPE_CONTROL_QW_WRITE);
	intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT);
	intel_ring_emit(ring, 0);
	intel_ring_emit(ring, 0);
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);

	return 0;
}

static int
gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode)
{
	struct intel_ring *ring = req->ring;
	u32 scratch_addr =
		i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES;
	u32 flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = intel_emit_post_sync_nonzero_flush(req);
	if (ret)
		return ret;

	/* Just flush everything.
Experiments have shown that reducing the 225 * number of bits based on the write domains has little performance 226 * impact. 227 */ 228 if (mode & EMIT_FLUSH) { 229 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 230 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 231 /* 232 * Ensure that any following seqno writes only happen 233 * when the render cache is indeed flushed. 234 */ 235 flags |= PIPE_CONTROL_CS_STALL; 236 } 237 if (mode & EMIT_INVALIDATE) { 238 flags |= PIPE_CONTROL_TLB_INVALIDATE; 239 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 240 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 241 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 242 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 243 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 244 /* 245 * TLB invalidate requires a post-sync write. 246 */ 247 flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; 248 } 249 250 ret = intel_ring_begin(req, 4); 251 if (ret) 252 return ret; 253 254 intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4)); 255 intel_ring_emit(ring, flags); 256 intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); 257 intel_ring_emit(ring, 0); 258 intel_ring_advance(ring); 259 260 return 0; 261} 262 263static int 264gen7_render_ring_cs_stall_wa(struct drm_i915_gem_request *req) 265{ 266 struct intel_ring *ring = req->ring; 267 int ret; 268 269 ret = intel_ring_begin(req, 4); 270 if (ret) 271 return ret; 272 273 intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4)); 274 intel_ring_emit(ring, 275 PIPE_CONTROL_CS_STALL | 276 PIPE_CONTROL_STALL_AT_SCOREBOARD); 277 intel_ring_emit(ring, 0); 278 intel_ring_emit(ring, 0); 279 intel_ring_advance(ring); 280 281 return 0; 282} 283 284static int 285gen7_render_ring_flush(struct drm_i915_gem_request *req, u32 mode) 286{ 287 struct intel_ring *ring = req->ring; 288 u32 scratch_addr = 289 i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES; 290 u32 flags = 0; 291 int ret; 292 293 /* 294 * Ensure that any following seqno writes only happen when the render 295 * cache is indeed flushed. 296 * 297 * Workaround: 4th PIPE_CONTROL command (except the ones with only 298 * read-cache invalidate bits set) must have the CS_STALL bit set. We 299 * don't try to be clever and just set it unconditionally. 300 */ 301 flags |= PIPE_CONTROL_CS_STALL; 302 303 /* Just flush everything. Experiments have shown that reducing the 304 * number of bits based on the write domains has little performance 305 * impact. 306 */ 307 if (mode & EMIT_FLUSH) { 308 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 309 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 310 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 311 flags |= PIPE_CONTROL_FLUSH_ENABLE; 312 } 313 if (mode & EMIT_INVALIDATE) { 314 flags |= PIPE_CONTROL_TLB_INVALIDATE; 315 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 316 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 317 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 318 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 319 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 320 flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR; 321 /* 322 * TLB invalidate requires a post-sync write. 323 */ 324 flags |= PIPE_CONTROL_QW_WRITE; 325 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; 326 327 flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD; 328 329 /* Workaround: we must issue a pipe_control with CS-stall bit 330 * set before a pipe_control command that has the state cache 331 * invalidate bit set. 
*/ 332 gen7_render_ring_cs_stall_wa(req); 333 } 334 335 ret = intel_ring_begin(req, 4); 336 if (ret) 337 return ret; 338 339 intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4)); 340 intel_ring_emit(ring, flags); 341 intel_ring_emit(ring, scratch_addr); 342 intel_ring_emit(ring, 0); 343 intel_ring_advance(ring); 344 345 return 0; 346} 347 348static int 349gen8_emit_pipe_control(struct drm_i915_gem_request *req, 350 u32 flags, u32 scratch_addr) 351{ 352 struct intel_ring *ring = req->ring; 353 int ret; 354 355 ret = intel_ring_begin(req, 6); 356 if (ret) 357 return ret; 358 359 intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6)); 360 intel_ring_emit(ring, flags); 361 intel_ring_emit(ring, scratch_addr); 362 intel_ring_emit(ring, 0); 363 intel_ring_emit(ring, 0); 364 intel_ring_emit(ring, 0); 365 intel_ring_advance(ring); 366 367 return 0; 368} 369 370static int 371gen8_render_ring_flush(struct drm_i915_gem_request *req, u32 mode) 372{ 373 u32 scratch_addr = 374 i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES; 375 u32 flags = 0; 376 int ret; 377 378 flags |= PIPE_CONTROL_CS_STALL; 379 380 if (mode & EMIT_FLUSH) { 381 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 382 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 383 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 384 flags |= PIPE_CONTROL_FLUSH_ENABLE; 385 } 386 if (mode & EMIT_INVALIDATE) { 387 flags |= PIPE_CONTROL_TLB_INVALIDATE; 388 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 389 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 390 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 391 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 392 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 393 flags |= PIPE_CONTROL_QW_WRITE; 394 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; 395 396 /* WaCsStallBeforeStateCacheInvalidate:bdw,chv */ 397 ret = gen8_emit_pipe_control(req, 398 PIPE_CONTROL_CS_STALL | 399 PIPE_CONTROL_STALL_AT_SCOREBOARD, 400 0); 401 if (ret) 402 return ret; 403 } 404 405 return gen8_emit_pipe_control(req, flags, scratch_addr); 406} 407 408static void ring_setup_phys_status_page(struct intel_engine_cs *engine) 409{ 410 struct drm_i915_private *dev_priv = engine->i915; 411 u32 addr; 412 413 addr = dev_priv->status_page_dmah->busaddr; 414 if (INTEL_GEN(dev_priv) >= 4) 415 addr |= (dev_priv->status_page_dmah->busaddr >> 28) & 0xf0; 416 I915_WRITE(HWS_PGA, addr); 417} 418 419static void intel_ring_setup_status_page(struct intel_engine_cs *engine) 420{ 421 struct drm_i915_private *dev_priv = engine->i915; 422 i915_reg_t mmio; 423 424 /* The ring status page addresses are no longer next to the rest of 425 * the ring registers as of gen7. 426 */ 427 if (IS_GEN7(dev_priv)) { 428 switch (engine->id) { 429 case RCS: 430 mmio = RENDER_HWS_PGA_GEN7; 431 break; 432 case BCS: 433 mmio = BLT_HWS_PGA_GEN7; 434 break; 435 /* 436 * VCS2 actually doesn't exist on Gen7. Only shut up 437 * gcc switch check warning 438 */ 439 case VCS2: 440 case VCS: 441 mmio = BSD_HWS_PGA_GEN7; 442 break; 443 case VECS: 444 mmio = VEBOX_HWS_PGA_GEN7; 445 break; 446 } 447 } else if (IS_GEN6(dev_priv)) { 448 mmio = RING_HWS_PGA_GEN6(engine->mmio_base); 449 } else { 450 /* XXX: gen8 returns to sanity */ 451 mmio = RING_HWS_PGA(engine->mmio_base); 452 } 453 454 I915_WRITE(mmio, engine->status_page.ggtt_offset); 455 POSTING_READ(mmio); 456 457 /* 458 * Flush the TLB for this page 459 * 460 * FIXME: These two bits have disappeared on gen8, so a question 461 * arises: do we still need this and if so how should we go about 462 * invalidating the TLB? 
463 */ 464 if (IS_GEN(dev_priv, 6, 7)) { 465 i915_reg_t reg = RING_INSTPM(engine->mmio_base); 466 467 /* ring should be idle before issuing a sync flush*/ 468 WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0); 469 470 I915_WRITE(reg, 471 _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE | 472 INSTPM_SYNC_FLUSH)); 473 if (intel_wait_for_register(dev_priv, 474 reg, INSTPM_SYNC_FLUSH, 0, 475 1000)) 476 DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n", 477 engine->name); 478 } 479} 480 481static bool stop_ring(struct intel_engine_cs *engine) 482{ 483 struct drm_i915_private *dev_priv = engine->i915; 484 485 if (INTEL_GEN(dev_priv) > 2) { 486 I915_WRITE_MODE(engine, _MASKED_BIT_ENABLE(STOP_RING)); 487 if (intel_wait_for_register(dev_priv, 488 RING_MI_MODE(engine->mmio_base), 489 MODE_IDLE, 490 MODE_IDLE, 491 1000)) { 492 DRM_ERROR("%s : timed out trying to stop ring\n", 493 engine->name); 494 /* Sometimes we observe that the idle flag is not 495 * set even though the ring is empty. So double 496 * check before giving up. 497 */ 498 if (I915_READ_HEAD(engine) != I915_READ_TAIL(engine)) 499 return false; 500 } 501 } 502 503 I915_WRITE_CTL(engine, 0); 504 I915_WRITE_HEAD(engine, 0); 505 I915_WRITE_TAIL(engine, 0); 506 507 if (INTEL_GEN(dev_priv) > 2) { 508 (void)I915_READ_CTL(engine); 509 I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING)); 510 } 511 512 return (I915_READ_HEAD(engine) & HEAD_ADDR) == 0; 513} 514 515static int init_ring_common(struct intel_engine_cs *engine) 516{ 517 struct drm_i915_private *dev_priv = engine->i915; 518 struct intel_ring *ring = engine->buffer; 519 int ret = 0; 520 521 intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); 522 523 if (!stop_ring(engine)) { 524 /* G45 ring initialization often fails to reset head to zero */ 525 DRM_DEBUG_KMS("%s head not reset to zero " 526 "ctl %08x head %08x tail %08x start %08x\n", 527 engine->name, 528 I915_READ_CTL(engine), 529 I915_READ_HEAD(engine), 530 I915_READ_TAIL(engine), 531 I915_READ_START(engine)); 532 533 if (!stop_ring(engine)) { 534 DRM_ERROR("failed to set %s head to zero " 535 "ctl %08x head %08x tail %08x start %08x\n", 536 engine->name, 537 I915_READ_CTL(engine), 538 I915_READ_HEAD(engine), 539 I915_READ_TAIL(engine), 540 I915_READ_START(engine)); 541 ret = -EIO; 542 goto out; 543 } 544 } 545 546 if (HWS_NEEDS_PHYSICAL(dev_priv)) 547 ring_setup_phys_status_page(engine); 548 else 549 intel_ring_setup_status_page(engine); 550 551 intel_engine_reset_breadcrumbs(engine); 552 553 /* Enforce ordering by reading HEAD register back */ 554 I915_READ_HEAD(engine); 555 556 /* Initialize the ring. This must happen _after_ we've cleared the ring 557 * registers with the above sequence (the readback of the HEAD registers 558 * also enforces ordering), otherwise the hw might lose the new ring 559 * register values. 
*/ 560 I915_WRITE_START(engine, i915_ggtt_offset(ring->vma)); 561 562 /* WaClearRingBufHeadRegAtInit:ctg,elk */ 563 if (I915_READ_HEAD(engine)) 564 DRM_DEBUG("%s initialization failed [head=%08x], fudging\n", 565 engine->name, I915_READ_HEAD(engine)); 566 567 intel_ring_update_space(ring); 568 I915_WRITE_HEAD(engine, ring->head); 569 I915_WRITE_TAIL(engine, ring->tail); 570 (void)I915_READ_TAIL(engine); 571 572 I915_WRITE_CTL(engine, RING_CTL_SIZE(ring->size) | RING_VALID); 573 574 /* If the head is still not zero, the ring is dead */ 575 if (intel_wait_for_register_fw(dev_priv, RING_CTL(engine->mmio_base), 576 RING_VALID, RING_VALID, 577 50)) { 578 DRM_ERROR("%s initialization failed " 579 "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n", 580 engine->name, 581 I915_READ_CTL(engine), 582 I915_READ_CTL(engine) & RING_VALID, 583 I915_READ_HEAD(engine), ring->head, 584 I915_READ_TAIL(engine), ring->tail, 585 I915_READ_START(engine), 586 i915_ggtt_offset(ring->vma)); 587 ret = -EIO; 588 goto out; 589 } 590 591 intel_engine_init_hangcheck(engine); 592 593out: 594 intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); 595 596 return ret; 597} 598 599static void reset_ring_common(struct intel_engine_cs *engine, 600 struct drm_i915_gem_request *request) 601{ 602 /* Try to restore the logical GPU state to match the continuation 603 * of the request queue. If we skip the context/PD restore, then 604 * the next request may try to execute assuming that its context 605 * is valid and loaded on the GPU and so may try to access invalid 606 * memory, prompting repeated GPU hangs. 607 * 608 * If the request was guilty, we still restore the logical state 609 * in case the next request requires it (e.g. the aliasing ppgtt), 610 * but skip over the hung batch. 611 * 612 * If the request was innocent, we try to replay the request with 613 * the restored context. 614 */ 615 if (request) { 616 struct drm_i915_private *dev_priv = request->i915; 617 struct intel_context *ce = &request->ctx->engine[engine->id]; 618 struct i915_hw_ppgtt *ppgtt; 619 620 /* FIXME consider gen8 reset */ 621 622 if (ce->state) { 623 I915_WRITE(CCID, 624 i915_ggtt_offset(ce->state) | 625 BIT(8) /* must be set! 
*/ | 626 CCID_EXTENDED_STATE_SAVE | 627 CCID_EXTENDED_STATE_RESTORE | 628 CCID_EN); 629 } 630 631 ppgtt = request->ctx->ppgtt ?: engine->i915->mm.aliasing_ppgtt; 632 if (ppgtt) { 633 u32 pd_offset = ppgtt->pd.base.ggtt_offset << 10; 634 635 I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G); 636 I915_WRITE(RING_PP_DIR_BASE(engine), pd_offset); 637 638 /* Wait for the PD reload to complete */ 639 if (intel_wait_for_register(dev_priv, 640 RING_PP_DIR_BASE(engine), 641 BIT(0), 0, 642 10)) 643 DRM_ERROR("Wait for reload of ppgtt page-directory timed out\n"); 644 645 ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine); 646 } 647 648 /* If the rq hung, jump to its breadcrumb and skip the batch */ 649 if (request->fence.error == -EIO) { 650 struct intel_ring *ring = request->ring; 651 652 ring->head = request->postfix; 653 ring->last_retired_head = -1; 654 } 655 } else { 656 engine->legacy_active_context = NULL; 657 } 658} 659 660static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req) 661{ 662 struct intel_ring *ring = req->ring; 663 struct i915_workarounds *w = &req->i915->workarounds; 664 int ret, i; 665 666 if (w->count == 0) 667 return 0; 668 669 ret = req->engine->emit_flush(req, EMIT_BARRIER); 670 if (ret) 671 return ret; 672 673 ret = intel_ring_begin(req, (w->count * 2 + 2)); 674 if (ret) 675 return ret; 676 677 intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(w->count)); 678 for (i = 0; i < w->count; i++) { 679 intel_ring_emit_reg(ring, w->reg[i].addr); 680 intel_ring_emit(ring, w->reg[i].value); 681 } 682 intel_ring_emit(ring, MI_NOOP); 683 684 intel_ring_advance(ring); 685 686 ret = req->engine->emit_flush(req, EMIT_BARRIER); 687 if (ret) 688 return ret; 689 690 DRM_DEBUG_DRIVER("Number of Workarounds emitted: %d\n", w->count); 691 692 return 0; 693} 694 695static int intel_rcs_ctx_init(struct drm_i915_gem_request *req) 696{ 697 int ret; 698 699 ret = intel_ring_workarounds_emit(req); 700 if (ret != 0) 701 return ret; 702 703 ret = i915_gem_render_state_emit(req); 704 if (ret) 705 return ret; 706 707 return 0; 708} 709 710static int wa_add(struct drm_i915_private *dev_priv, 711 i915_reg_t addr, 712 const u32 mask, const u32 val) 713{ 714 const u32 idx = dev_priv->workarounds.count; 715 716 if (WARN_ON(idx >= I915_MAX_WA_REGS)) 717 return -ENOSPC; 718 719 dev_priv->workarounds.reg[idx].addr = addr; 720 dev_priv->workarounds.reg[idx].value = val; 721 dev_priv->workarounds.reg[idx].mask = mask; 722 723 dev_priv->workarounds.count++; 724 725 return 0; 726} 727 728#define WA_REG(addr, mask, val) do { \ 729 const int r = wa_add(dev_priv, (addr), (mask), (val)); \ 730 if (r) \ 731 return r; \ 732 } while (0) 733 734#define WA_SET_BIT_MASKED(addr, mask) \ 735 WA_REG(addr, (mask), _MASKED_BIT_ENABLE(mask)) 736 737#define WA_CLR_BIT_MASKED(addr, mask) \ 738 WA_REG(addr, (mask), _MASKED_BIT_DISABLE(mask)) 739 740#define WA_SET_FIELD_MASKED(addr, mask, value) \ 741 WA_REG(addr, mask, _MASKED_FIELD(mask, value)) 742 743#define WA_SET_BIT(addr, mask) WA_REG(addr, mask, I915_READ(addr) | (mask)) 744#define WA_CLR_BIT(addr, mask) WA_REG(addr, mask, I915_READ(addr) & ~(mask)) 745 746#define WA_WRITE(addr, val) WA_REG(addr, 0xffffffff, val) 747 748static int wa_ring_whitelist_reg(struct intel_engine_cs *engine, 749 i915_reg_t reg) 750{ 751 struct drm_i915_private *dev_priv = engine->i915; 752 struct i915_workarounds *wa = &dev_priv->workarounds; 753 const uint32_t index = wa->hw_whitelist_count[engine->id]; 754 755 if (WARN_ON(index >= RING_MAX_NONPRIV_SLOTS)) 756 return -EINVAL; 757 758 
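	/*
	 * As the surrounding code suggests, each whitelisted register
	 * occupies one FORCE_TO_NONPRIV slot on this engine, capped at
	 * RING_MAX_NONPRIV_SLOTS by the check above.  Writing the
	 * register's mmio offset into the slot below is what makes it
	 * writable from non-privileged command buffers, and
	 * hw_whitelist_count tracks the next free slot per engine.
	 */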
WA_WRITE(RING_FORCE_TO_NONPRIV(engine->mmio_base, index), 759 i915_mmio_reg_offset(reg)); 760 wa->hw_whitelist_count[engine->id]++; 761 762 return 0; 763} 764 765static int gen8_init_workarounds(struct intel_engine_cs *engine) 766{ 767 struct drm_i915_private *dev_priv = engine->i915; 768 769 WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING); 770 771 /* WaDisableAsyncFlipPerfMode:bdw,chv */ 772 WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE); 773 774 /* WaDisablePartialInstShootdown:bdw,chv */ 775 WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, 776 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); 777 778 /* Use Force Non-Coherent whenever executing a 3D context. This is a 779 * workaround for for a possible hang in the unlikely event a TLB 780 * invalidation occurs during a PSD flush. 781 */ 782 /* WaForceEnableNonCoherent:bdw,chv */ 783 /* WaHdcDisableFetchWhenMasked:bdw,chv */ 784 WA_SET_BIT_MASKED(HDC_CHICKEN0, 785 HDC_DONOT_FETCH_MEM_WHEN_MASKED | 786 HDC_FORCE_NON_COHERENT); 787 788 /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0: 789 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping 790 * polygons in the same 8x4 pixel/sample area to be processed without 791 * stalling waiting for the earlier ones to write to Hierarchical Z 792 * buffer." 793 * 794 * This optimization is off by default for BDW and CHV; turn it on. 795 */ 796 WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE); 797 798 /* Wa4x4STCOptimizationDisable:bdw,chv */ 799 WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE); 800 801 /* 802 * BSpec recommends 8x4 when MSAA is used, 803 * however in practice 16x4 seems fastest. 804 * 805 * Note that PS/WM thread counts depend on the WIZ hashing 806 * disable bit, which we don't touch here, but it's good 807 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). 808 */ 809 WA_SET_FIELD_MASKED(GEN7_GT_MODE, 810 GEN6_WIZ_HASHING_MASK, 811 GEN6_WIZ_HASHING_16x4); 812 813 return 0; 814} 815 816static int bdw_init_workarounds(struct intel_engine_cs *engine) 817{ 818 struct drm_i915_private *dev_priv = engine->i915; 819 int ret; 820 821 ret = gen8_init_workarounds(engine); 822 if (ret) 823 return ret; 824 825 /* WaDisableThreadStallDopClockGating:bdw (pre-production) */ 826 WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); 827 828 /* WaDisableDopClockGating:bdw */ 829 WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, 830 DOP_CLOCK_GATING_DISABLE); 831 832 WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, 833 GEN8_SAMPLER_POWER_BYPASS_DIS); 834 835 WA_SET_BIT_MASKED(HDC_CHICKEN0, 836 /* WaForceContextSaveRestoreNonCoherent:bdw */ 837 HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | 838 /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */ 839 (IS_BDW_GT3(dev_priv) ? HDC_FENCE_DEST_SLM_DISABLE : 0)); 840 841 return 0; 842} 843 844static int chv_init_workarounds(struct intel_engine_cs *engine) 845{ 846 struct drm_i915_private *dev_priv = engine->i915; 847 int ret; 848 849 ret = gen8_init_workarounds(engine); 850 if (ret) 851 return ret; 852 853 /* WaDisableThreadStallDopClockGating:chv */ 854 WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); 855 856 /* Improve HiZ throughput on CHV. 
*/ 857 WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X); 858 859 return 0; 860} 861 862static int gen9_init_workarounds(struct intel_engine_cs *engine) 863{ 864 struct drm_i915_private *dev_priv = engine->i915; 865 int ret; 866 867 /* WaConextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl */ 868 I915_WRITE(GEN9_CSFE_CHICKEN1_RCS, _MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE)); 869 870 /* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl */ 871 I915_WRITE(BDW_SCRATCH1, I915_READ(BDW_SCRATCH1) | 872 GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE); 873 874 /* WaDisableKillLogic:bxt,skl,kbl */ 875 I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | 876 ECOCHK_DIS_TLB); 877 878 /* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl */ 879 /* WaDisablePartialInstShootdown:skl,bxt,kbl */ 880 WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, 881 FLOW_CONTROL_ENABLE | 882 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); 883 884 /* Syncing dependencies between camera and graphics:skl,bxt,kbl */ 885 WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, 886 GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC); 887 888 /* WaDisableDgMirrorFixInHalfSliceChicken5:bxt */ 889 if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) 890 WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5, 891 GEN9_DG_MIRROR_FIX_ENABLE); 892 893 /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:bxt */ 894 if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) { 895 WA_SET_BIT_MASKED(GEN7_COMMON_SLICE_CHICKEN1, 896 GEN9_RHWO_OPTIMIZATION_DISABLE); 897 /* 898 * WA also requires GEN9_SLICE_COMMON_ECO_CHICKEN0[14:14] to be set 899 * but we do that in per ctx batchbuffer as there is an issue 900 * with this register not getting restored on ctx restore 901 */ 902 } 903 904 /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl */ 905 WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7, 906 GEN9_ENABLE_GPGPU_PREEMPTION); 907 908 /* Wa4x4STCOptimizationDisable:skl,bxt,kbl */ 909 /* WaDisablePartialResolveInVc:skl,bxt,kbl */ 910 WA_SET_BIT_MASKED(CACHE_MODE_1, (GEN8_4x4_STC_OPTIMIZATION_DISABLE | 911 GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE)); 912 913 /* WaCcsTlbPrefetchDisable:skl,bxt,kbl */ 914 WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5, 915 GEN9_CCS_TLB_PREFETCH_ENABLE); 916 917 /* WaDisableMaskBasedCammingInRCC:bxt */ 918 if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) 919 WA_SET_BIT_MASKED(SLICE_ECO_CHICKEN0, 920 PIXEL_MASK_CAMMING_DISABLE); 921 922 /* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl */ 923 WA_SET_BIT_MASKED(HDC_CHICKEN0, 924 HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | 925 HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE); 926 927 /* WaForceEnableNonCoherent and WaDisableHDCInvalidation are 928 * both tied to WaForceContextSaveRestoreNonCoherent 929 * in some hsds for skl. We keep the tie for all gen9. The 930 * documentation is a bit hazy and so we want to get common behaviour, 931 * even though there is no clear evidence we would need both on kbl/bxt. 932 * This area has been source of system hangs so we play it safe 933 * and mimic the skl regardless of what bspec says. 934 * 935 * Use Force Non-Coherent whenever executing a 3D context. This 936 * is a workaround for a possible hang in the unlikely event 937 * a TLB invalidation occurs during a PSD flush. 
938 */ 939 940 /* WaForceEnableNonCoherent:skl,bxt,kbl */ 941 WA_SET_BIT_MASKED(HDC_CHICKEN0, 942 HDC_FORCE_NON_COHERENT); 943 944 /* WaDisableHDCInvalidation:skl,bxt,kbl */ 945 I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | 946 BDW_DISABLE_HDC_INVALIDATION); 947 948 /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl */ 949 if (IS_SKYLAKE(dev_priv) || 950 IS_KABYLAKE(dev_priv) || 951 IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0)) 952 WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, 953 GEN8_SAMPLER_POWER_BYPASS_DIS); 954 955 /* WaDisableSTUnitPowerOptimization:skl,bxt,kbl */ 956 WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE); 957 958 /* WaOCLCoherentLineFlush:skl,bxt,kbl */ 959 I915_WRITE(GEN8_L3SQCREG4, (I915_READ(GEN8_L3SQCREG4) | 960 GEN8_LQSC_FLUSH_COHERENT_LINES)); 961 962 /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt */ 963 ret = wa_ring_whitelist_reg(engine, GEN9_CTX_PREEMPT_REG); 964 if (ret) 965 return ret; 966 967 /* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl */ 968 ret= wa_ring_whitelist_reg(engine, GEN8_CS_CHICKEN1); 969 if (ret) 970 return ret; 971 972 /* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl */ 973 ret = wa_ring_whitelist_reg(engine, GEN8_HDC_CHICKEN1); 974 if (ret) 975 return ret; 976 977 return 0; 978} 979 980static int skl_tune_iz_hashing(struct intel_engine_cs *engine) 981{ 982 struct drm_i915_private *dev_priv = engine->i915; 983 u8 vals[3] = { 0, 0, 0 }; 984 unsigned int i; 985 986 for (i = 0; i < 3; i++) { 987 u8 ss; 988 989 /* 990 * Only consider slices where one, and only one, subslice has 7 991 * EUs 992 */ 993 if (!is_power_of_2(INTEL_INFO(dev_priv)->sseu.subslice_7eu[i])) 994 continue; 995 996 /* 997 * subslice_7eu[i] != 0 (because of the check above) and 998 * ss_max == 4 (maximum number of subslices possible per slice) 999 * 1000 * -> 0 <= ss <= 3; 1001 */ 1002 ss = ffs(INTEL_INFO(dev_priv)->sseu.subslice_7eu[i]) - 1; 1003 vals[i] = 3 - ss; 1004 } 1005 1006 if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0) 1007 return 0; 1008 1009 /* Tune IZ hashing. 
See intel_device_info_runtime_init() */ 1010 WA_SET_FIELD_MASKED(GEN7_GT_MODE, 1011 GEN9_IZ_HASHING_MASK(2) | 1012 GEN9_IZ_HASHING_MASK(1) | 1013 GEN9_IZ_HASHING_MASK(0), 1014 GEN9_IZ_HASHING(2, vals[2]) | 1015 GEN9_IZ_HASHING(1, vals[1]) | 1016 GEN9_IZ_HASHING(0, vals[0])); 1017 1018 return 0; 1019} 1020 1021static int skl_init_workarounds(struct intel_engine_cs *engine) 1022{ 1023 struct drm_i915_private *dev_priv = engine->i915; 1024 int ret; 1025 1026 ret = gen9_init_workarounds(engine); 1027 if (ret) 1028 return ret; 1029 1030 /* 1031 * Actual WA is to disable percontext preemption granularity control 1032 * until D0 which is the default case so this is equivalent to 1033 * !WaDisablePerCtxtPreemptionGranularityControl:skl 1034 */ 1035 I915_WRITE(GEN7_FF_SLICE_CS_CHICKEN1, 1036 _MASKED_BIT_ENABLE(GEN9_FFSC_PERCTX_PREEMPT_CTRL)); 1037 1038 /* WaEnableGapsTsvCreditFix:skl */ 1039 I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) | 1040 GEN9_GAPS_TSV_CREDIT_DISABLE)); 1041 1042 /* WaDisableGafsUnitClkGating:skl */ 1043 WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); 1044 1045 /* WaInPlaceDecompressionHang:skl */ 1046 if (IS_SKL_REVID(dev_priv, SKL_REVID_H0, REVID_FOREVER)) 1047 WA_SET_BIT(GEN9_GAMT_ECO_REG_RW_IA, 1048 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); 1049 1050 /* WaDisableLSQCROPERFforOCL:skl */ 1051 ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4); 1052 if (ret) 1053 return ret; 1054 1055 return skl_tune_iz_hashing(engine); 1056} 1057 1058static int bxt_init_workarounds(struct intel_engine_cs *engine) 1059{ 1060 struct drm_i915_private *dev_priv = engine->i915; 1061 int ret; 1062 1063 ret = gen9_init_workarounds(engine); 1064 if (ret) 1065 return ret; 1066 1067 /* WaStoreMultiplePTEenable:bxt */ 1068 /* This is a requirement according to Hardware specification */ 1069 if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) 1070 I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_TLBPF); 1071 1072 /* WaSetClckGatingDisableMedia:bxt */ 1073 if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) { 1074 I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) & 1075 ~GEN8_DOP_CLOCK_GATE_MEDIA_ENABLE)); 1076 } 1077 1078 /* WaDisableThreadStallDopClockGating:bxt */ 1079 WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, 1080 STALL_DOP_GATING_DISABLE); 1081 1082 /* WaDisablePooledEuLoadBalancingFix:bxt */ 1083 if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER)) { 1084 WA_SET_BIT_MASKED(FF_SLICE_CS_CHICKEN2, 1085 GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE); 1086 } 1087 1088 /* WaDisableSbeCacheDispatchPortSharing:bxt */ 1089 if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0)) { 1090 WA_SET_BIT_MASKED( 1091 GEN7_HALF_SLICE_CHICKEN1, 1092 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); 1093 } 1094 1095 /* WaDisableObjectLevelPreemptionForTrifanOrPolygon:bxt */ 1096 /* WaDisableObjectLevelPreemptionForInstancedDraw:bxt */ 1097 /* WaDisableObjectLevelPreemtionForInstanceId:bxt */ 1098 /* WaDisableLSQCROPERFforOCL:bxt */ 1099 if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) { 1100 ret = wa_ring_whitelist_reg(engine, GEN9_CS_DEBUG_MODE1); 1101 if (ret) 1102 return ret; 1103 1104 ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4); 1105 if (ret) 1106 return ret; 1107 } 1108 1109 /* WaProgramL3SqcReg1DefaultForPerf:bxt */ 1110 if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER)) 1111 I915_WRITE(GEN8_L3SQCREG1, L3_GENERAL_PRIO_CREDITS(62) | 1112 L3_HIGH_PRIO_CREDITS(2)); 1113 1114 /* WaToEnableHwFixForPushConstHWBug:bxt */ 1115 if (IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER)) 1116 
WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, 1117 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); 1118 1119 /* WaInPlaceDecompressionHang:bxt */ 1120 if (IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER)) 1121 WA_SET_BIT(GEN9_GAMT_ECO_REG_RW_IA, 1122 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); 1123 1124 return 0; 1125} 1126 1127static int kbl_init_workarounds(struct intel_engine_cs *engine) 1128{ 1129 struct drm_i915_private *dev_priv = engine->i915; 1130 int ret; 1131 1132 ret = gen9_init_workarounds(engine); 1133 if (ret) 1134 return ret; 1135 1136 /* WaEnableGapsTsvCreditFix:kbl */ 1137 I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) | 1138 GEN9_GAPS_TSV_CREDIT_DISABLE)); 1139 1140 /* WaDisableDynamicCreditSharing:kbl */ 1141 if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_B0)) 1142 WA_SET_BIT(GAMT_CHKN_BIT_REG, 1143 GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING); 1144 1145 /* WaDisableFenceDestinationToSLM:kbl (pre-prod) */ 1146 if (IS_KBL_REVID(dev_priv, KBL_REVID_A0, KBL_REVID_A0)) 1147 WA_SET_BIT_MASKED(HDC_CHICKEN0, 1148 HDC_FENCE_DEST_SLM_DISABLE); 1149 1150 /* WaToEnableHwFixForPushConstHWBug:kbl */ 1151 if (IS_KBL_REVID(dev_priv, KBL_REVID_C0, REVID_FOREVER)) 1152 WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, 1153 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); 1154 1155 /* WaDisableGafsUnitClkGating:kbl */ 1156 WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); 1157 1158 /* WaDisableSbeCacheDispatchPortSharing:kbl */ 1159 WA_SET_BIT_MASKED( 1160 GEN7_HALF_SLICE_CHICKEN1, 1161 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); 1162 1163 /* WaInPlaceDecompressionHang:kbl */ 1164 WA_SET_BIT(GEN9_GAMT_ECO_REG_RW_IA, 1165 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); 1166 1167 /* WaDisableLSQCROPERFforOCL:kbl */ 1168 ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4); 1169 if (ret) 1170 return ret; 1171 1172 return 0; 1173} 1174 1175int init_workarounds_ring(struct intel_engine_cs *engine) 1176{ 1177 struct drm_i915_private *dev_priv = engine->i915; 1178 1179 WARN_ON(engine->id != RCS); 1180 1181 dev_priv->workarounds.count = 0; 1182 dev_priv->workarounds.hw_whitelist_count[RCS] = 0; 1183 1184 if (IS_BROADWELL(dev_priv)) 1185 return bdw_init_workarounds(engine); 1186 1187 if (IS_CHERRYVIEW(dev_priv)) 1188 return chv_init_workarounds(engine); 1189 1190 if (IS_SKYLAKE(dev_priv)) 1191 return skl_init_workarounds(engine); 1192 1193 if (IS_BROXTON(dev_priv)) 1194 return bxt_init_workarounds(engine); 1195 1196 if (IS_KABYLAKE(dev_priv)) 1197 return kbl_init_workarounds(engine); 1198 1199 return 0; 1200} 1201 1202static int init_render_ring(struct intel_engine_cs *engine) 1203{ 1204 struct drm_i915_private *dev_priv = engine->i915; 1205 int ret = init_ring_common(engine); 1206 if (ret) 1207 return ret; 1208 1209 /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */ 1210 if (IS_GEN(dev_priv, 4, 6)) 1211 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH)); 1212 1213 /* We need to disable the AsyncFlip performance optimisations in order 1214 * to use MI_WAIT_FOR_EVENT within the CS. It should already be 1215 * programmed to '1' on all products. 
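	 * (Masked registers such as MI_MODE take a value in the low 16 bits
	 * and a write-enable mask in the high 16 bits; _MASKED_BIT_ENABLE(x)
	 * expands to (x << 16) | x, so only the named bit is modified by
	 * the writes below.)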
1216 * 1217 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv 1218 */ 1219 if (IS_GEN(dev_priv, 6, 7)) 1220 I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE)); 1221 1222 /* Required for the hardware to program scanline values for waiting */ 1223 /* WaEnableFlushTlbInvalidationMode:snb */ 1224 if (IS_GEN6(dev_priv)) 1225 I915_WRITE(GFX_MODE, 1226 _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT)); 1227 1228 /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */ 1229 if (IS_GEN7(dev_priv)) 1230 I915_WRITE(GFX_MODE_GEN7, 1231 _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) | 1232 _MASKED_BIT_ENABLE(GFX_REPLAY_MODE)); 1233 1234 if (IS_GEN6(dev_priv)) { 1235 /* From the Sandybridge PRM, volume 1 part 3, page 24: 1236 * "If this bit is set, STCunit will have LRA as replacement 1237 * policy. [...] This bit must be reset. LRA replacement 1238 * policy is not supported." 1239 */ 1240 I915_WRITE(CACHE_MODE_0, 1241 _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB)); 1242 } 1243 1244 if (IS_GEN(dev_priv, 6, 7)) 1245 I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); 1246 1247 if (INTEL_INFO(dev_priv)->gen >= 6) 1248 I915_WRITE_IMR(engine, ~engine->irq_keep_mask); 1249 1250 return init_workarounds_ring(engine); 1251} 1252 1253static void render_ring_cleanup(struct intel_engine_cs *engine) 1254{ 1255 struct drm_i915_private *dev_priv = engine->i915; 1256 1257 i915_vma_unpin_and_release(&dev_priv->semaphore); 1258} 1259 1260static u32 *gen8_rcs_signal(struct drm_i915_gem_request *req, u32 *out) 1261{ 1262 struct drm_i915_private *dev_priv = req->i915; 1263 struct intel_engine_cs *waiter; 1264 enum intel_engine_id id; 1265 1266 for_each_engine(waiter, dev_priv, id) { 1267 u64 gtt_offset = req->engine->semaphore.signal_ggtt[id]; 1268 if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID) 1269 continue; 1270 1271 *out++ = GFX_OP_PIPE_CONTROL(6); 1272 *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB | 1273 PIPE_CONTROL_QW_WRITE | 1274 PIPE_CONTROL_CS_STALL); 1275 *out++ = lower_32_bits(gtt_offset); 1276 *out++ = upper_32_bits(gtt_offset); 1277 *out++ = req->global_seqno; 1278 *out++ = 0; 1279 *out++ = (MI_SEMAPHORE_SIGNAL | 1280 MI_SEMAPHORE_TARGET(waiter->hw_id)); 1281 *out++ = 0; 1282 } 1283 1284 return out; 1285} 1286 1287static u32 *gen8_xcs_signal(struct drm_i915_gem_request *req, u32 *out) 1288{ 1289 struct drm_i915_private *dev_priv = req->i915; 1290 struct intel_engine_cs *waiter; 1291 enum intel_engine_id id; 1292 1293 for_each_engine(waiter, dev_priv, id) { 1294 u64 gtt_offset = req->engine->semaphore.signal_ggtt[id]; 1295 if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID) 1296 continue; 1297 1298 *out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW; 1299 *out++ = lower_32_bits(gtt_offset) | MI_FLUSH_DW_USE_GTT; 1300 *out++ = upper_32_bits(gtt_offset); 1301 *out++ = req->global_seqno; 1302 *out++ = (MI_SEMAPHORE_SIGNAL | 1303 MI_SEMAPHORE_TARGET(waiter->hw_id)); 1304 *out++ = 0; 1305 } 1306 1307 return out; 1308} 1309 1310static u32 *gen6_signal(struct drm_i915_gem_request *req, u32 *out) 1311{ 1312 struct drm_i915_private *dev_priv = req->i915; 1313 struct intel_engine_cs *engine; 1314 enum intel_engine_id id; 1315 int num_rings = 0; 1316 1317 for_each_engine(engine, dev_priv, id) { 1318 i915_reg_t mbox_reg; 1319 1320 if (!(BIT(engine->hw_id) & GEN6_SEMAPHORES_MASK)) 1321 continue; 1322 1323 mbox_reg = req->engine->semaphore.mbox.signal[engine->hw_id]; 1324 if (i915_mmio_reg_valid(mbox_reg)) { 1325 *out++ = MI_LOAD_REGISTER_IMM(1); 1326 *out++ = i915_mmio_reg_offset(mbox_reg); 1327 *out++ = req->global_seqno; 
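			/*
			 * The three dwords above form a single
			 * MI_LOAD_REGISTER_IMM(1) that writes the request's
			 * global seqno into the semaphore mailbox paired
			 * with this engine; num_rings counts the mailboxes
			 * written so the MI_NOOP below can pad the emission
			 * to an even number of dwords.
			 */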
1328 num_rings++; 1329 } 1330 } 1331 if (num_rings & 1) 1332 *out++ = MI_NOOP; 1333 1334 return out; 1335} 1336 1337static void i9xx_submit_request(struct drm_i915_gem_request *request) 1338{ 1339 struct drm_i915_private *dev_priv = request->i915; 1340 1341 i915_gem_request_submit(request); 1342 1343 I915_WRITE_TAIL(request->engine, request->tail); 1344} 1345 1346static void i9xx_emit_breadcrumb(struct drm_i915_gem_request *req, 1347 u32 *out) 1348{ 1349 *out++ = MI_STORE_DWORD_INDEX; 1350 *out++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT; 1351 *out++ = req->global_seqno; 1352 *out++ = MI_USER_INTERRUPT; 1353 1354 req->tail = intel_ring_offset(req->ring, out); 1355} 1356 1357static const int i9xx_emit_breadcrumb_sz = 4; 1358 1359/** 1360 * gen6_sema_emit_breadcrumb - Update the semaphore mailbox registers 1361 * 1362 * @request - request to write to the ring 1363 * 1364 * Update the mailbox registers in the *other* rings with the current seqno. 1365 * This acts like a signal in the canonical semaphore. 1366 */ 1367static void gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req, 1368 u32 *out) 1369{ 1370 return i9xx_emit_breadcrumb(req, 1371 req->engine->semaphore.signal(req, out)); 1372} 1373 1374static void gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req, 1375 u32 *out) 1376{ 1377 struct intel_engine_cs *engine = req->engine; 1378 1379 if (engine->semaphore.signal) 1380 out = engine->semaphore.signal(req, out); 1381 1382 *out++ = GFX_OP_PIPE_CONTROL(6); 1383 *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB | 1384 PIPE_CONTROL_CS_STALL | 1385 PIPE_CONTROL_QW_WRITE); 1386 *out++ = intel_hws_seqno_address(engine); 1387 *out++ = 0; 1388 *out++ = req->global_seqno; 1389 /* We're thrashing one dword of HWS. */ 1390 *out++ = 0; 1391 *out++ = MI_USER_INTERRUPT; 1392 *out++ = MI_NOOP; 1393 1394 req->tail = intel_ring_offset(req->ring, out); 1395} 1396 1397static const int gen8_render_emit_breadcrumb_sz = 8; 1398 1399/** 1400 * intel_ring_sync - sync the waiter to the signaller on seqno 1401 * 1402 * @waiter - ring that is waiting 1403 * @signaller - ring which has, or will signal 1404 * @seqno - seqno which the waiter will block on 1405 */ 1406 1407static int 1408gen8_ring_sync_to(struct drm_i915_gem_request *req, 1409 struct drm_i915_gem_request *signal) 1410{ 1411 struct intel_ring *ring = req->ring; 1412 struct drm_i915_private *dev_priv = req->i915; 1413 u64 offset = GEN8_WAIT_OFFSET(req->engine, signal->engine->id); 1414 struct i915_hw_ppgtt *ppgtt; 1415 int ret; 1416 1417 ret = intel_ring_begin(req, 4); 1418 if (ret) 1419 return ret; 1420 1421 intel_ring_emit(ring, 1422 MI_SEMAPHORE_WAIT | 1423 MI_SEMAPHORE_GLOBAL_GTT | 1424 MI_SEMAPHORE_SAD_GTE_SDD); 1425 intel_ring_emit(ring, signal->global_seqno); 1426 intel_ring_emit(ring, lower_32_bits(offset)); 1427 intel_ring_emit(ring, upper_32_bits(offset)); 1428 intel_ring_advance(ring); 1429 1430 /* When the !RCS engines idle waiting upon a semaphore, they lose their 1431 * pagetables and we must reload them before executing the batch. 1432 * We do this on the i915_switch_context() following the wait and 1433 * before the dispatch. 
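	 *
	 * (The MI_SEMAPHORE_WAIT emitted above uses SAD_GTE_SDD, i.e. the
	 * waiter polls the qword at @offset until the value written there
	 * by the signaller is >= signal->global_seqno, so by the time the
	 * batch after this wait runs, i915_switch_context() will have seen
	 * the pd_dirty_rings flag set below and reloaded the page
	 * directories.)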
1434 */ 1435 ppgtt = req->ctx->ppgtt; 1436 if (ppgtt && req->engine->id != RCS) 1437 ppgtt->pd_dirty_rings |= intel_engine_flag(req->engine); 1438 return 0; 1439} 1440 1441static int 1442gen6_ring_sync_to(struct drm_i915_gem_request *req, 1443 struct drm_i915_gem_request *signal) 1444{ 1445 struct intel_ring *ring = req->ring; 1446 u32 dw1 = MI_SEMAPHORE_MBOX | 1447 MI_SEMAPHORE_COMPARE | 1448 MI_SEMAPHORE_REGISTER; 1449 u32 wait_mbox = signal->engine->semaphore.mbox.wait[req->engine->hw_id]; 1450 int ret; 1451 1452 WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID); 1453 1454 ret = intel_ring_begin(req, 4); 1455 if (ret) 1456 return ret; 1457 1458 intel_ring_emit(ring, dw1 | wait_mbox); 1459 /* Throughout all of the GEM code, seqno passed implies our current 1460 * seqno is >= the last seqno executed. However for hardware the 1461 * comparison is strictly greater than. 1462 */ 1463 intel_ring_emit(ring, signal->global_seqno - 1); 1464 intel_ring_emit(ring, 0); 1465 intel_ring_emit(ring, MI_NOOP); 1466 intel_ring_advance(ring); 1467 1468 return 0; 1469} 1470 1471static void 1472gen5_seqno_barrier(struct intel_engine_cs *engine) 1473{ 1474 /* MI_STORE are internally buffered by the GPU and not flushed 1475 * either by MI_FLUSH or SyncFlush or any other combination of 1476 * MI commands. 1477 * 1478 * "Only the submission of the store operation is guaranteed. 1479 * The write result will be complete (coherent) some time later 1480 * (this is practically a finite period but there is no guaranteed 1481 * latency)." 1482 * 1483 * Empirically, we observe that we need a delay of at least 75us to 1484 * be sure that the seqno write is visible by the CPU. 1485 */ 1486 usleep_range(125, 250); 1487} 1488 1489static void 1490gen6_seqno_barrier(struct intel_engine_cs *engine) 1491{ 1492 struct drm_i915_private *dev_priv = engine->i915; 1493 1494 /* Workaround to force correct ordering between irq and seqno writes on 1495 * ivb (and maybe also on snb) by reading from a CS register (like 1496 * ACTHD) before reading the status page. 1497 * 1498 * Note that this effectively stalls the read by the time it takes to 1499 * do a memory transaction, which more or less ensures that the write 1500 * from the GPU has sufficient time to invalidate the CPU cacheline. 1501 * Alternatively we could delay the interrupt from the CS ring to give 1502 * the write time to land, but that would incur a delay after every 1503 * batch i.e. much more frequent than a delay when waiting for the 1504 * interrupt (with the same net latency). 1505 * 1506 * Also note that to prevent whole machine hangs on gen7, we have to 1507 * take the spinlock to guard against concurrent cacheline access. 
1508 */ 1509 spin_lock_irq(&dev_priv->uncore.lock); 1510 POSTING_READ_FW(RING_ACTHD(engine->mmio_base)); 1511 spin_unlock_irq(&dev_priv->uncore.lock); 1512} 1513 1514static void 1515gen5_irq_enable(struct intel_engine_cs *engine) 1516{ 1517 gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask); 1518} 1519 1520static void 1521gen5_irq_disable(struct intel_engine_cs *engine) 1522{ 1523 gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask); 1524} 1525 1526static void 1527i9xx_irq_enable(struct intel_engine_cs *engine) 1528{ 1529 struct drm_i915_private *dev_priv = engine->i915; 1530 1531 dev_priv->irq_mask &= ~engine->irq_enable_mask; 1532 I915_WRITE(IMR, dev_priv->irq_mask); 1533 POSTING_READ_FW(RING_IMR(engine->mmio_base)); 1534} 1535 1536static void 1537i9xx_irq_disable(struct intel_engine_cs *engine) 1538{ 1539 struct drm_i915_private *dev_priv = engine->i915; 1540 1541 dev_priv->irq_mask |= engine->irq_enable_mask; 1542 I915_WRITE(IMR, dev_priv->irq_mask); 1543} 1544 1545static void 1546i8xx_irq_enable(struct intel_engine_cs *engine) 1547{ 1548 struct drm_i915_private *dev_priv = engine->i915; 1549 1550 dev_priv->irq_mask &= ~engine->irq_enable_mask; 1551 I915_WRITE16(IMR, dev_priv->irq_mask); 1552 POSTING_READ16(RING_IMR(engine->mmio_base)); 1553} 1554 1555static void 1556i8xx_irq_disable(struct intel_engine_cs *engine) 1557{ 1558 struct drm_i915_private *dev_priv = engine->i915; 1559 1560 dev_priv->irq_mask |= engine->irq_enable_mask; 1561 I915_WRITE16(IMR, dev_priv->irq_mask); 1562} 1563 1564static int 1565bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode) 1566{ 1567 struct intel_ring *ring = req->ring; 1568 int ret; 1569 1570 ret = intel_ring_begin(req, 2); 1571 if (ret) 1572 return ret; 1573 1574 intel_ring_emit(ring, MI_FLUSH); 1575 intel_ring_emit(ring, MI_NOOP); 1576 intel_ring_advance(ring); 1577 return 0; 1578} 1579 1580static void 1581gen6_irq_enable(struct intel_engine_cs *engine) 1582{ 1583 struct drm_i915_private *dev_priv = engine->i915; 1584 1585 I915_WRITE_IMR(engine, 1586 ~(engine->irq_enable_mask | 1587 engine->irq_keep_mask)); 1588 gen5_enable_gt_irq(dev_priv, engine->irq_enable_mask); 1589} 1590 1591static void 1592gen6_irq_disable(struct intel_engine_cs *engine) 1593{ 1594 struct drm_i915_private *dev_priv = engine->i915; 1595 1596 I915_WRITE_IMR(engine, ~engine->irq_keep_mask); 1597 gen5_disable_gt_irq(dev_priv, engine->irq_enable_mask); 1598} 1599 1600static void 1601hsw_vebox_irq_enable(struct intel_engine_cs *engine) 1602{ 1603 struct drm_i915_private *dev_priv = engine->i915; 1604 1605 I915_WRITE_IMR(engine, ~engine->irq_enable_mask); 1606 gen6_unmask_pm_irq(dev_priv, engine->irq_enable_mask); 1607} 1608 1609static void 1610hsw_vebox_irq_disable(struct intel_engine_cs *engine) 1611{ 1612 struct drm_i915_private *dev_priv = engine->i915; 1613 1614 I915_WRITE_IMR(engine, ~0); 1615 gen6_mask_pm_irq(dev_priv, engine->irq_enable_mask); 1616} 1617 1618static void 1619gen8_irq_enable(struct intel_engine_cs *engine) 1620{ 1621 struct drm_i915_private *dev_priv = engine->i915; 1622 1623 I915_WRITE_IMR(engine, 1624 ~(engine->irq_enable_mask | 1625 engine->irq_keep_mask)); 1626 POSTING_READ_FW(RING_IMR(engine->mmio_base)); 1627} 1628 1629static void 1630gen8_irq_disable(struct intel_engine_cs *engine) 1631{ 1632 struct drm_i915_private *dev_priv = engine->i915; 1633 1634 I915_WRITE_IMR(engine, ~engine->irq_keep_mask); 1635} 1636 1637static int 1638i965_emit_bb_start(struct drm_i915_gem_request *req, 1639 u64 offset, u32 length, 1640 unsigned int 
dispatch_flags) 1641{ 1642 struct intel_ring *ring = req->ring; 1643 int ret; 1644 1645 ret = intel_ring_begin(req, 2); 1646 if (ret) 1647 return ret; 1648 1649 intel_ring_emit(ring, 1650 MI_BATCH_BUFFER_START | 1651 MI_BATCH_GTT | 1652 (dispatch_flags & I915_DISPATCH_SECURE ? 1653 0 : MI_BATCH_NON_SECURE_I965)); 1654 intel_ring_emit(ring, offset); 1655 intel_ring_advance(ring); 1656 1657 return 0; 1658} 1659 1660/* Just userspace ABI convention to limit the wa batch bo to a resonable size */ 1661#define I830_BATCH_LIMIT (256*1024) 1662#define I830_TLB_ENTRIES (2) 1663#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT) 1664static int 1665i830_emit_bb_start(struct drm_i915_gem_request *req, 1666 u64 offset, u32 len, 1667 unsigned int dispatch_flags) 1668{ 1669 struct intel_ring *ring = req->ring; 1670 u32 cs_offset = i915_ggtt_offset(req->engine->scratch); 1671 int ret; 1672 1673 ret = intel_ring_begin(req, 6); 1674 if (ret) 1675 return ret; 1676 1677 /* Evict the invalid PTE TLBs */ 1678 intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA); 1679 intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096); 1680 intel_ring_emit(ring, I830_TLB_ENTRIES << 16 | 4); /* load each page */ 1681 intel_ring_emit(ring, cs_offset); 1682 intel_ring_emit(ring, 0xdeadbeef); 1683 intel_ring_emit(ring, MI_NOOP); 1684 intel_ring_advance(ring); 1685 1686 if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) { 1687 if (len > I830_BATCH_LIMIT) 1688 return -ENOSPC; 1689 1690 ret = intel_ring_begin(req, 6 + 2); 1691 if (ret) 1692 return ret; 1693 1694 /* Blit the batch (which has now all relocs applied) to the 1695 * stable batch scratch bo area (so that the CS never 1696 * stumbles over its tlb invalidation bug) ... 1697 */ 1698 intel_ring_emit(ring, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA); 1699 intel_ring_emit(ring, 1700 BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096); 1701 intel_ring_emit(ring, DIV_ROUND_UP(len, 4096) << 16 | 4096); 1702 intel_ring_emit(ring, cs_offset); 1703 intel_ring_emit(ring, 4096); 1704 intel_ring_emit(ring, offset); 1705 1706 intel_ring_emit(ring, MI_FLUSH); 1707 intel_ring_emit(ring, MI_NOOP); 1708 intel_ring_advance(ring); 1709 1710 /* ... and execute it. */ 1711 offset = cs_offset; 1712 } 1713 1714 ret = intel_ring_begin(req, 2); 1715 if (ret) 1716 return ret; 1717 1718 intel_ring_emit(ring, MI_BATCH_BUFFER_START | MI_BATCH_GTT); 1719 intel_ring_emit(ring, offset | (dispatch_flags & I915_DISPATCH_SECURE ? 1720 0 : MI_BATCH_NON_SECURE)); 1721 intel_ring_advance(ring); 1722 1723 return 0; 1724} 1725 1726static int 1727i915_emit_bb_start(struct drm_i915_gem_request *req, 1728 u64 offset, u32 len, 1729 unsigned int dispatch_flags) 1730{ 1731 struct intel_ring *ring = req->ring; 1732 int ret; 1733 1734 ret = intel_ring_begin(req, 2); 1735 if (ret) 1736 return ret; 1737 1738 intel_ring_emit(ring, MI_BATCH_BUFFER_START | MI_BATCH_GTT); 1739 intel_ring_emit(ring, offset | (dispatch_flags & I915_DISPATCH_SECURE ? 
1740 0 : MI_BATCH_NON_SECURE)); 1741 intel_ring_advance(ring); 1742 1743 return 0; 1744} 1745 1746static void cleanup_phys_status_page(struct intel_engine_cs *engine) 1747{ 1748 struct drm_i915_private *dev_priv = engine->i915; 1749 1750 if (!dev_priv->status_page_dmah) 1751 return; 1752 1753 drm_pci_free(&dev_priv->drm, dev_priv->status_page_dmah); 1754 engine->status_page.page_addr = NULL; 1755} 1756 1757static void cleanup_status_page(struct intel_engine_cs *engine) 1758{ 1759 struct i915_vma *vma; 1760 struct drm_i915_gem_object *obj; 1761 1762 vma = fetch_and_zero(&engine->status_page.vma); 1763 if (!vma) 1764 return; 1765 1766 obj = vma->obj; 1767 1768 i915_vma_unpin(vma); 1769 i915_vma_close(vma); 1770 1771 i915_gem_object_unpin_map(obj); 1772 __i915_gem_object_release_unless_active(obj); 1773} 1774 1775static int init_status_page(struct intel_engine_cs *engine) 1776{ 1777 struct drm_i915_gem_object *obj; 1778 struct i915_vma *vma; 1779 unsigned int flags; 1780 void *vaddr; 1781 int ret; 1782 1783 obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE); 1784 if (IS_ERR(obj)) { 1785 DRM_ERROR("Failed to allocate status page\n"); 1786 return PTR_ERR(obj); 1787 } 1788 1789 ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC); 1790 if (ret) 1791 goto err; 1792 1793 vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL); 1794 if (IS_ERR(vma)) { 1795 ret = PTR_ERR(vma); 1796 goto err; 1797 } 1798 1799 flags = PIN_GLOBAL; 1800 if (!HAS_LLC(engine->i915)) 1801 /* On g33, we cannot place HWS above 256MiB, so 1802 * restrict its pinning to the low mappable arena. 1803 * Though this restriction is not documented for 1804 * gen4, gen5, or byt, they also behave similarly 1805 * and hang if the HWS is placed at the top of the 1806 * GTT. To generalise, it appears that all !llc 1807 * platforms have issues with us placing the HWS 1808 * above the mappable region (even though we never 1809 * actualy map it). 1810 */ 1811 flags |= PIN_MAPPABLE; 1812 ret = i915_vma_pin(vma, 0, 4096, flags); 1813 if (ret) 1814 goto err; 1815 1816 vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB); 1817 if (IS_ERR(vaddr)) { 1818 ret = PTR_ERR(vaddr); 1819 goto err_unpin; 1820 } 1821 1822 engine->status_page.vma = vma; 1823 engine->status_page.ggtt_offset = i915_ggtt_offset(vma); 1824 engine->status_page.page_addr = memset(vaddr, 0, PAGE_SIZE); 1825 1826 DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n", 1827 engine->name, i915_ggtt_offset(vma)); 1828 return 0; 1829 1830err_unpin: 1831 i915_vma_unpin(vma); 1832err: 1833 i915_gem_object_put(obj); 1834 return ret; 1835} 1836 1837static int init_phys_status_page(struct intel_engine_cs *engine) 1838{ 1839 struct drm_i915_private *dev_priv = engine->i915; 1840 1841 dev_priv->status_page_dmah = 1842 drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE); 1843 if (!dev_priv->status_page_dmah) 1844 return -ENOMEM; 1845 1846 engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr; 1847 memset(engine->status_page.page_addr, 0, PAGE_SIZE); 1848 1849 return 0; 1850} 1851 1852int intel_ring_pin(struct intel_ring *ring, unsigned int offset_bias) 1853{ 1854 unsigned int flags; 1855 enum i915_map_type map; 1856 struct i915_vma *vma = ring->vma; 1857 void *addr; 1858 int ret; 1859 1860 GEM_BUG_ON(ring->vaddr); 1861 1862 map = HAS_LLC(ring->engine->i915) ? 
I915_MAP_WB : I915_MAP_WC; 1863 1864 flags = PIN_GLOBAL; 1865 if (offset_bias) 1866 flags |= PIN_OFFSET_BIAS | offset_bias; 1867 if (vma->obj->stolen) 1868 flags |= PIN_MAPPABLE; 1869 1870 if (!(vma->flags & I915_VMA_GLOBAL_BIND)) { 1871 if (flags & PIN_MAPPABLE || map == I915_MAP_WC) 1872 ret = i915_gem_object_set_to_gtt_domain(vma->obj, true); 1873 else 1874 ret = i915_gem_object_set_to_cpu_domain(vma->obj, true); 1875 if (unlikely(ret)) 1876 return ret; 1877 } 1878 1879 ret = i915_vma_pin(vma, 0, PAGE_SIZE, flags); 1880 if (unlikely(ret)) 1881 return ret; 1882 1883 if (i915_vma_is_map_and_fenceable(vma)) 1884 addr = (void __force *)i915_vma_pin_iomap(vma); 1885 else 1886 addr = i915_gem_object_pin_map(vma->obj, map); 1887 if (IS_ERR(addr)) 1888 goto err; 1889 1890 ring->vaddr = addr; 1891 return 0; 1892 1893err: 1894 i915_vma_unpin(vma); 1895 return PTR_ERR(addr); 1896} 1897 1898void intel_ring_unpin(struct intel_ring *ring) 1899{ 1900 GEM_BUG_ON(!ring->vma); 1901 GEM_BUG_ON(!ring->vaddr); 1902 1903 if (i915_vma_is_map_and_fenceable(ring->vma)) 1904 i915_vma_unpin_iomap(ring->vma); 1905 else 1906 i915_gem_object_unpin_map(ring->vma->obj); 1907 ring->vaddr = NULL; 1908 1909 i915_vma_unpin(ring->vma); 1910} 1911 1912static struct i915_vma * 1913intel_ring_create_vma(struct drm_i915_private *dev_priv, int size) 1914{ 1915 struct drm_i915_gem_object *obj; 1916 struct i915_vma *vma; 1917 1918 obj = i915_gem_object_create_stolen(dev_priv, size); 1919 if (!obj) 1920 obj = i915_gem_object_create(dev_priv, size); 1921 if (IS_ERR(obj)) 1922 return ERR_CAST(obj); 1923 1924 /* mark ring buffers as read-only from GPU side by default */ 1925 obj->gt_ro = 1; 1926 1927 vma = i915_vma_instance(obj, &dev_priv->ggtt.base, NULL); 1928 if (IS_ERR(vma)) 1929 goto err; 1930 1931 return vma; 1932 1933err: 1934 i915_gem_object_put(obj); 1935 return vma; 1936} 1937 1938struct intel_ring * 1939intel_engine_create_ring(struct intel_engine_cs *engine, int size) 1940{ 1941 struct intel_ring *ring; 1942 struct i915_vma *vma; 1943 1944 GEM_BUG_ON(!is_power_of_2(size)); 1945 GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES); 1946 1947 ring = kzalloc(sizeof(*ring), GFP_KERNEL); 1948 if (!ring) 1949 return ERR_PTR(-ENOMEM); 1950 1951 ring->engine = engine; 1952 1953 INIT_LIST_HEAD(&ring->request_list); 1954 1955 ring->size = size; 1956 /* Workaround an erratum on the i830 which causes a hang if 1957 * the TAIL pointer points to within the last 2 cachelines 1958 * of the buffer. 1959 */ 1960 ring->effective_size = size; 1961 if (IS_I830(engine->i915) || IS_I845G(engine->i915)) 1962 ring->effective_size -= 2 * CACHELINE_BYTES; 1963 1964 ring->last_retired_head = -1; 1965 intel_ring_update_space(ring); 1966 1967 vma = intel_ring_create_vma(engine->i915, size); 1968 if (IS_ERR(vma)) { 1969 kfree(ring); 1970 return ERR_CAST(vma); 1971 } 1972 ring->vma = vma; 1973 1974 return ring; 1975} 1976 1977void 1978intel_ring_free(struct intel_ring *ring) 1979{ 1980 struct drm_i915_gem_object *obj = ring->vma->obj; 1981 1982 i915_vma_close(ring->vma); 1983 __i915_gem_object_release_unless_active(obj); 1984 1985 kfree(ring); 1986} 1987 1988static int context_pin(struct i915_gem_context *ctx, unsigned int flags) 1989{ 1990 struct i915_vma *vma = ctx->engine[RCS].state; 1991 int ret; 1992 1993 /* Clear this page out of any CPU caches for coherent swap-in/out. 1994 * We only want to do this on the first bind so that we do not stall 1995 * on an active context (which by nature is already on the GPU). 
1996 */ 1997 if (!(vma->flags & I915_VMA_GLOBAL_BIND)) { 1998 ret = i915_gem_object_set_to_gtt_domain(vma->obj, false); 1999 if (ret) 2000 return ret; 2001 } 2002 2003 return i915_vma_pin(vma, 0, ctx->ggtt_alignment, PIN_GLOBAL | flags); 2004} 2005 2006static int intel_ring_context_pin(struct intel_engine_cs *engine, 2007 struct i915_gem_context *ctx) 2008{ 2009 struct intel_context *ce = &ctx->engine[engine->id]; 2010 int ret; 2011 2012 lockdep_assert_held(&ctx->i915->drm.struct_mutex); 2013 2014 if (ce->pin_count++) 2015 return 0; 2016 2017 if (ce->state) { 2018 unsigned int flags; 2019 2020 flags = 0; 2021 if (i915_gem_context_is_kernel(ctx)) 2022 flags = PIN_HIGH; 2023 2024 ret = context_pin(ctx, flags); 2025 if (ret) 2026 goto error; 2027 2028 ce->state->obj->mm.dirty = true; 2029 } 2030 2031 /* The kernel context is only used as a placeholder for flushing the 2032 * active context. It is never used for submitting user rendering and 2033 * as such never requires the golden render context, and so we can skip 2034 * emitting it when we switch to the kernel context. This is required 2035 * as during eviction we cannot allocate and pin the renderstate in 2036 * order to initialise the context. 2037 */ 2038 if (i915_gem_context_is_kernel(ctx)) 2039 ce->initialised = true; 2040 2041 i915_gem_context_get(ctx); 2042 return 0; 2043 2044error: 2045 ce->pin_count = 0; 2046 return ret; 2047} 2048 2049static void intel_ring_context_unpin(struct intel_engine_cs *engine, 2050 struct i915_gem_context *ctx) 2051{ 2052 struct intel_context *ce = &ctx->engine[engine->id]; 2053 2054 lockdep_assert_held(&ctx->i915->drm.struct_mutex); 2055 GEM_BUG_ON(ce->pin_count == 0); 2056 2057 if (--ce->pin_count) 2058 return; 2059 2060 if (ce->state) 2061 i915_vma_unpin(ce->state); 2062 2063 i915_gem_context_put(ctx); 2064} 2065 2066static int intel_init_ring_buffer(struct intel_engine_cs *engine) 2067{ 2068 struct drm_i915_private *dev_priv = engine->i915; 2069 struct intel_ring *ring; 2070 int ret; 2071 2072 WARN_ON(engine->buffer); 2073 2074 intel_engine_setup_common(engine); 2075 2076 ret = intel_engine_init_common(engine); 2077 if (ret) 2078 goto error; 2079 2080 ring = intel_engine_create_ring(engine, 32 * PAGE_SIZE); 2081 if (IS_ERR(ring)) { 2082 ret = PTR_ERR(ring); 2083 goto error; 2084 } 2085 2086 if (HWS_NEEDS_PHYSICAL(dev_priv)) { 2087 WARN_ON(engine->id != RCS); 2088 ret = init_phys_status_page(engine); 2089 if (ret) 2090 goto error; 2091 } else { 2092 ret = init_status_page(engine); 2093 if (ret) 2094 goto error; 2095 } 2096 2097 /* Ring wraparound at offset 0 sometimes hangs. No idea why. 
*/ 2098 ret = intel_ring_pin(ring, I915_GTT_PAGE_SIZE); 2099 if (ret) { 2100 intel_ring_free(ring); 2101 goto error; 2102 } 2103 engine->buffer = ring; 2104 2105 return 0; 2106 2107error: 2108 intel_engine_cleanup(engine); 2109 return ret; 2110} 2111 2112void intel_engine_cleanup(struct intel_engine_cs *engine) 2113{ 2114 struct drm_i915_private *dev_priv; 2115 2116 dev_priv = engine->i915; 2117 2118 if (engine->buffer) { 2119 WARN_ON(INTEL_GEN(dev_priv) > 2 && 2120 (I915_READ_MODE(engine) & MODE_IDLE) == 0); 2121 2122 intel_ring_unpin(engine->buffer); 2123 intel_ring_free(engine->buffer); 2124 engine->buffer = NULL; 2125 } 2126 2127 if (engine->cleanup) 2128 engine->cleanup(engine); 2129 2130 if (HWS_NEEDS_PHYSICAL(dev_priv)) { 2131 WARN_ON(engine->id != RCS); 2132 cleanup_phys_status_page(engine); 2133 } else { 2134 cleanup_status_page(engine); 2135 } 2136 2137 intel_engine_cleanup_common(engine); 2138 2139 engine->i915 = NULL; 2140 dev_priv->engine[engine->id] = NULL; 2141 kfree(engine); 2142} 2143 2144void intel_legacy_submission_resume(struct drm_i915_private *dev_priv) 2145{ 2146 struct intel_engine_cs *engine; 2147 enum intel_engine_id id; 2148 2149 for_each_engine(engine, dev_priv, id) { 2150 engine->buffer->head = engine->buffer->tail; 2151 engine->buffer->last_retired_head = -1; 2152 } 2153} 2154 2155static int ring_request_alloc(struct drm_i915_gem_request *request) 2156{ 2157 int ret; 2158 2159 GEM_BUG_ON(!request->ctx->engine[request->engine->id].pin_count); 2160 2161 /* Flush enough space to reduce the likelihood of waiting after 2162 * we start building the request - in which case we will just 2163 * have to repeat work. 2164 */ 2165 request->reserved_space += LEGACY_REQUEST_SIZE; 2166 2167 GEM_BUG_ON(!request->engine->buffer); 2168 request->ring = request->engine->buffer; 2169 2170 ret = intel_ring_begin(request, 0); 2171 if (ret) 2172 return ret; 2173 2174 request->reserved_space -= LEGACY_REQUEST_SIZE; 2175 return 0; 2176} 2177 2178static int wait_for_space(struct drm_i915_gem_request *req, int bytes) 2179{ 2180 struct intel_ring *ring = req->ring; 2181 struct drm_i915_gem_request *target; 2182 long timeout; 2183 2184 lockdep_assert_held(&req->i915->drm.struct_mutex); 2185 2186 intel_ring_update_space(ring); 2187 if (ring->space >= bytes) 2188 return 0; 2189 2190 /* 2191 * Space is reserved in the ringbuffer for finalising the request, 2192 * as that cannot be allowed to fail. During request finalisation, 2193 * reserved_space is set to 0 to stop the overallocation and the 2194 * assumption is that then we never need to wait (which has the 2195 * risk of failing with EINTR). 2196 * 2197 * See also i915_gem_request_alloc() and i915_add_request(). 2198 */ 2199 GEM_BUG_ON(!req->reserved_space); 2200 2201 list_for_each_entry(target, &ring->request_list, ring_link) { 2202 unsigned space; 2203 2204 /* Would completion of this request free enough space? 
*/ 2205 space = __intel_ring_space(target->postfix, ring->tail, 2206 ring->size); 2207 if (space >= bytes) 2208 break; 2209 } 2210 2211 if (WARN_ON(&target->ring_link == &ring->request_list)) 2212 return -ENOSPC; 2213 2214 timeout = i915_wait_request(target, 2215 I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, 2216 MAX_SCHEDULE_TIMEOUT); 2217 if (timeout < 0) 2218 return timeout; 2219 2220 i915_gem_request_retire_upto(target); 2221 2222 intel_ring_update_space(ring); 2223 GEM_BUG_ON(ring->space < bytes); 2224 return 0; 2225} 2226 2227int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords) 2228{ 2229 struct intel_ring *ring = req->ring; 2230 int remain_actual = ring->size - ring->tail; 2231 int remain_usable = ring->effective_size - ring->tail; 2232 int bytes = num_dwords * sizeof(u32); 2233 int total_bytes, wait_bytes; 2234 bool need_wrap = false; 2235 2236 total_bytes = bytes + req->reserved_space; 2237 2238 if (unlikely(bytes > remain_usable)) { 2239 /* 2240 * Not enough space for the basic request. So need to flush 2241 * out the remainder and then wait for base + reserved. 2242 */ 2243 wait_bytes = remain_actual + total_bytes; 2244 need_wrap = true; 2245 } else if (unlikely(total_bytes > remain_usable)) { 2246 /* 2247 * The base request will fit but the reserved space 2248 * falls off the end. So we don't need an immediate wrap 2249 * and only need to effectively wait for the reserved 2250 * size space from the start of ringbuffer. 2251 */ 2252 wait_bytes = remain_actual + req->reserved_space; 2253 } else { 2254 /* No wrapping required, just waiting. */ 2255 wait_bytes = total_bytes; 2256 } 2257 2258 if (wait_bytes > ring->space) { 2259 int ret = wait_for_space(req, wait_bytes); 2260 if (unlikely(ret)) 2261 return ret; 2262 } 2263 2264 if (unlikely(need_wrap)) { 2265 GEM_BUG_ON(remain_actual > ring->space); 2266 GEM_BUG_ON(ring->tail + remain_actual > ring->size); 2267 2268 /* Fill the tail with MI_NOOP */ 2269 memset(ring->vaddr + ring->tail, 0, remain_actual); 2270 ring->tail = 0; 2271 ring->space -= remain_actual; 2272 } 2273 2274 ring->space -= bytes; 2275 GEM_BUG_ON(ring->space < 0); 2276 return 0; 2277} 2278 2279/* Align the ring tail to a cacheline boundary */ 2280int intel_ring_cacheline_align(struct drm_i915_gem_request *req) 2281{ 2282 struct intel_ring *ring = req->ring; 2283 int num_dwords = 2284 (ring->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t); 2285 int ret; 2286 2287 if (num_dwords == 0) 2288 return 0; 2289 2290 num_dwords = CACHELINE_BYTES / sizeof(uint32_t) - num_dwords; 2291 ret = intel_ring_begin(req, num_dwords); 2292 if (ret) 2293 return ret; 2294 2295 while (num_dwords--) 2296 intel_ring_emit(ring, MI_NOOP); 2297 2298 intel_ring_advance(ring); 2299 2300 return 0; 2301} 2302 2303static void gen6_bsd_submit_request(struct drm_i915_gem_request *request) 2304{ 2305 struct drm_i915_private *dev_priv = request->i915; 2306 2307 intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); 2308 2309 /* Every tail move must follow the sequence below */ 2310 2311 /* Disable notification that the ring is IDLE. The GT 2312 * will then assume that it is busy and bring it out of rc6. 2313 */ 2314 I915_WRITE_FW(GEN6_BSD_SLEEP_PSMI_CONTROL, 2315 _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE)); 2316 2317 /* Clear the context id. Here be magic! */ 2318 I915_WRITE64_FW(GEN6_BSD_RNCID, 0x0); 2319 2320 /* Wait for the ring not to be idle, i.e. for it to wake up. 
*/ 2321 if (intel_wait_for_register_fw(dev_priv, 2322 GEN6_BSD_SLEEP_PSMI_CONTROL, 2323 GEN6_BSD_SLEEP_INDICATOR, 2324 0, 2325 50)) 2326 DRM_ERROR("timed out waiting for the BSD ring to wake up\n"); 2327 2328 /* Now that the ring is fully powered up, update the tail */ 2329 i9xx_submit_request(request); 2330 2331 /* Let the ring send IDLE messages to the GT again, 2332 * and so let it sleep to conserve power when idle. 2333 */ 2334 I915_WRITE_FW(GEN6_BSD_SLEEP_PSMI_CONTROL, 2335 _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE)); 2336 2337 intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); 2338} 2339 2340static int gen6_bsd_ring_flush(struct drm_i915_gem_request *req, u32 mode) 2341{ 2342 struct intel_ring *ring = req->ring; 2343 uint32_t cmd; 2344 int ret; 2345 2346 ret = intel_ring_begin(req, 4); 2347 if (ret) 2348 return ret; 2349 2350 cmd = MI_FLUSH_DW; 2351 if (INTEL_GEN(req->i915) >= 8) 2352 cmd += 1; 2353 2354 /* We always require a command barrier so that subsequent 2355 * commands, such as breadcrumb interrupts, are strictly ordered 2356 * wrt the contents of the write cache being flushed to memory 2357 * (and thus being coherent from the CPU). 2358 */ 2359 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 2360 2361 /* 2362 * Bspec vol 1c.5 - video engine command streamer: 2363 * "If ENABLED, all TLBs will be invalidated once the flush 2364 * operation is complete. This bit is only valid when the 2365 * Post-Sync Operation field is a value of 1h or 3h." 2366 */ 2367 if (mode & EMIT_INVALIDATE) 2368 cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD; 2369 2370 intel_ring_emit(ring, cmd); 2371 intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT); 2372 if (INTEL_GEN(req->i915) >= 8) { 2373 intel_ring_emit(ring, 0); /* upper addr */ 2374 intel_ring_emit(ring, 0); /* value */ 2375 } else { 2376 intel_ring_emit(ring, 0); 2377 intel_ring_emit(ring, MI_NOOP); 2378 } 2379 intel_ring_advance(ring); 2380 return 0; 2381} 2382 2383static int 2384gen8_emit_bb_start(struct drm_i915_gem_request *req, 2385 u64 offset, u32 len, 2386 unsigned int dispatch_flags) 2387{ 2388 struct intel_ring *ring = req->ring; 2389 bool ppgtt = USES_PPGTT(req->i915) && 2390 !(dispatch_flags & I915_DISPATCH_SECURE); 2391 int ret; 2392 2393 ret = intel_ring_begin(req, 4); 2394 if (ret) 2395 return ret; 2396 2397 /* FIXME(BDW): Address space and security selectors. */ 2398 intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 | (ppgtt<<8) | 2399 (dispatch_flags & I915_DISPATCH_RS ? 2400 MI_BATCH_RESOURCE_STREAMER : 0)); 2401 intel_ring_emit(ring, lower_32_bits(offset)); 2402 intel_ring_emit(ring, upper_32_bits(offset)); 2403 intel_ring_emit(ring, MI_NOOP); 2404 intel_ring_advance(ring); 2405 2406 return 0; 2407} 2408 2409static int 2410hsw_emit_bb_start(struct drm_i915_gem_request *req, 2411 u64 offset, u32 len, 2412 unsigned int dispatch_flags) 2413{ 2414 struct intel_ring *ring = req->ring; 2415 int ret; 2416 2417 ret = intel_ring_begin(req, 2); 2418 if (ret) 2419 return ret; 2420 2421 intel_ring_emit(ring, 2422 MI_BATCH_BUFFER_START | 2423 (dispatch_flags & I915_DISPATCH_SECURE ? 2424 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW) | 2425 (dispatch_flags & I915_DISPATCH_RS ? 
2426 MI_BATCH_RESOURCE_STREAMER : 0)); 2427 /* bit0-7 is the length on GEN6+ */ 2428 intel_ring_emit(ring, offset); 2429 intel_ring_advance(ring); 2430 2431 return 0; 2432} 2433 2434static int 2435gen6_emit_bb_start(struct drm_i915_gem_request *req, 2436 u64 offset, u32 len, 2437 unsigned int dispatch_flags) 2438{ 2439 struct intel_ring *ring = req->ring; 2440 int ret; 2441 2442 ret = intel_ring_begin(req, 2); 2443 if (ret) 2444 return ret; 2445 2446 intel_ring_emit(ring, 2447 MI_BATCH_BUFFER_START | 2448 (dispatch_flags & I915_DISPATCH_SECURE ? 2449 0 : MI_BATCH_NON_SECURE_I965)); 2450 /* bit0-7 is the length on GEN6+ */ 2451 intel_ring_emit(ring, offset); 2452 intel_ring_advance(ring); 2453 2454 return 0; 2455} 2456 2457/* Blitter support (SandyBridge+) */ 2458 2459static int gen6_ring_flush(struct drm_i915_gem_request *req, u32 mode) 2460{ 2461 struct intel_ring *ring = req->ring; 2462 uint32_t cmd; 2463 int ret; 2464 2465 ret = intel_ring_begin(req, 4); 2466 if (ret) 2467 return ret; 2468 2469 cmd = MI_FLUSH_DW; 2470 if (INTEL_GEN(req->i915) >= 8) 2471 cmd += 1; 2472 2473 /* We always require a command barrier so that subsequent 2474 * commands, such as breadcrumb interrupts, are strictly ordered 2475 * wrt the contents of the write cache being flushed to memory 2476 * (and thus being coherent from the CPU). 2477 */ 2478 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 2479 2480 /* 2481 * Bspec vol 1c.3 - blitter engine command streamer: 2482 * "If ENABLED, all TLBs will be invalidated once the flush 2483 * operation is complete. This bit is only valid when the 2484 * Post-Sync Operation field is a value of 1h or 3h." 2485 */ 2486 if (mode & EMIT_INVALIDATE) 2487 cmd |= MI_INVALIDATE_TLB; 2488 intel_ring_emit(ring, cmd); 2489 intel_ring_emit(ring, 2490 I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT); 2491 if (INTEL_GEN(req->i915) >= 8) { 2492 intel_ring_emit(ring, 0); /* upper addr */ 2493 intel_ring_emit(ring, 0); /* value */ 2494 } else { 2495 intel_ring_emit(ring, 0); 2496 intel_ring_emit(ring, MI_NOOP); 2497 } 2498 intel_ring_advance(ring); 2499 2500 return 0; 2501} 2502 2503static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv, 2504 struct intel_engine_cs *engine) 2505{ 2506 struct drm_i915_gem_object *obj; 2507 int ret, i; 2508 2509 if (!i915.semaphores) 2510 return; 2511 2512 if (INTEL_GEN(dev_priv) >= 8 && !dev_priv->semaphore) { 2513 struct i915_vma *vma; 2514 2515 obj = i915_gem_object_create(dev_priv, PAGE_SIZE); 2516 if (IS_ERR(obj)) 2517 goto err; 2518 2519 vma = i915_vma_instance(obj, &dev_priv->ggtt.base, NULL); 2520 if (IS_ERR(vma)) 2521 goto err_obj; 2522 2523 ret = i915_gem_object_set_to_gtt_domain(obj, false); 2524 if (ret) 2525 goto err_obj; 2526 2527 ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); 2528 if (ret) 2529 goto err_obj; 2530 2531 dev_priv->semaphore = vma; 2532 } 2533 2534 if (INTEL_GEN(dev_priv) >= 8) { 2535 u32 offset = i915_ggtt_offset(dev_priv->semaphore); 2536 2537 engine->semaphore.sync_to = gen8_ring_sync_to; 2538 engine->semaphore.signal = gen8_xcs_signal; 2539 2540 for (i = 0; i < I915_NUM_ENGINES; i++) { 2541 u32 ring_offset; 2542 2543 if (i != engine->id) 2544 ring_offset = offset + GEN8_SEMAPHORE_OFFSET(engine->id, i); 2545 else 2546 ring_offset = MI_SEMAPHORE_SYNC_INVALID; 2547 2548 engine->semaphore.signal_ggtt[i] = ring_offset; 2549 } 2550 } else if (INTEL_GEN(dev_priv) >= 6) { 2551 engine->semaphore.sync_to = gen6_ring_sync_to; 2552 engine->semaphore.signal = gen6_signal; 2553 2554 /* 2555 * The current 
semaphore is only applied on pre-gen8 2556 * platform. And there is no VCS2 ring on the pre-gen8 2557 * platform. So the semaphore between RCS and VCS2 is 2558 * initialized as INVALID. Gen8 will initialize the 2559 * sema between VCS2 and RCS later. 2560 */ 2561 for (i = 0; i < GEN6_NUM_SEMAPHORES; i++) { 2562 static const struct { 2563 u32 wait_mbox; 2564 i915_reg_t mbox_reg; 2565 } sem_data[GEN6_NUM_SEMAPHORES][GEN6_NUM_SEMAPHORES] = { 2566 [RCS_HW] = { 2567 [VCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_RV, .mbox_reg = GEN6_VRSYNC }, 2568 [BCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_RB, .mbox_reg = GEN6_BRSYNC }, 2569 [VECS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_RVE, .mbox_reg = GEN6_VERSYNC }, 2570 }, 2571 [VCS_HW] = { 2572 [RCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VR, .mbox_reg = GEN6_RVSYNC }, 2573 [BCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VB, .mbox_reg = GEN6_BVSYNC }, 2574 [VECS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VVE, .mbox_reg = GEN6_VEVSYNC }, 2575 }, 2576 [BCS_HW] = { 2577 [RCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_BR, .mbox_reg = GEN6_RBSYNC }, 2578 [VCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_BV, .mbox_reg = GEN6_VBSYNC }, 2579 [VECS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_BVE, .mbox_reg = GEN6_VEBSYNC }, 2580 }, 2581 [VECS_HW] = { 2582 [RCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VER, .mbox_reg = GEN6_RVESYNC }, 2583 [VCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VEV, .mbox_reg = GEN6_VVESYNC }, 2584 [BCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VEB, .mbox_reg = GEN6_BVESYNC }, 2585 }, 2586 }; 2587 u32 wait_mbox; 2588 i915_reg_t mbox_reg; 2589 2590 if (i == engine->hw_id) { 2591 wait_mbox = MI_SEMAPHORE_SYNC_INVALID; 2592 mbox_reg = GEN6_NOSYNC; 2593 } else { 2594 wait_mbox = sem_data[engine->hw_id][i].wait_mbox; 2595 mbox_reg = sem_data[engine->hw_id][i].mbox_reg; 2596 } 2597 2598 engine->semaphore.mbox.wait[i] = wait_mbox; 2599 engine->semaphore.mbox.signal[i] = mbox_reg; 2600 } 2601 } 2602 2603 return; 2604 2605err_obj: 2606 i915_gem_object_put(obj); 2607err: 2608 DRM_DEBUG_DRIVER("Failed to allocate space for semaphores, disabling\n"); 2609 i915.semaphores = 0; 2610} 2611 2612static void intel_ring_init_irq(struct drm_i915_private *dev_priv, 2613 struct intel_engine_cs *engine) 2614{ 2615 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << engine->irq_shift; 2616 2617 if (INTEL_GEN(dev_priv) >= 8) { 2618 engine->irq_enable = gen8_irq_enable; 2619 engine->irq_disable = gen8_irq_disable; 2620 engine->irq_seqno_barrier = gen6_seqno_barrier; 2621 } else if (INTEL_GEN(dev_priv) >= 6) { 2622 engine->irq_enable = gen6_irq_enable; 2623 engine->irq_disable = gen6_irq_disable; 2624 engine->irq_seqno_barrier = gen6_seqno_barrier; 2625 } else if (INTEL_GEN(dev_priv) >= 5) { 2626 engine->irq_enable = gen5_irq_enable; 2627 engine->irq_disable = gen5_irq_disable; 2628 engine->irq_seqno_barrier = gen5_seqno_barrier; 2629 } else if (INTEL_GEN(dev_priv) >= 3) { 2630 engine->irq_enable = i9xx_irq_enable; 2631 engine->irq_disable = i9xx_irq_disable; 2632 } else { 2633 engine->irq_enable = i8xx_irq_enable; 2634 engine->irq_disable = i8xx_irq_disable; 2635 } 2636} 2637 2638static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv, 2639 struct intel_engine_cs *engine) 2640{ 2641 intel_ring_init_irq(dev_priv, engine); 2642 intel_ring_init_semaphores(dev_priv, engine); 2643 2644 engine->init_hw = init_ring_common; 2645 engine->reset_hw = reset_ring_common; 2646 2647 engine->context_pin = intel_ring_context_pin; 2648 engine->context_unpin = intel_ring_context_unpin; 2649 2650 
engine->request_alloc = ring_request_alloc; 2651 2652 engine->emit_breadcrumb = i9xx_emit_breadcrumb; 2653 engine->emit_breadcrumb_sz = i9xx_emit_breadcrumb_sz; 2654 if (i915.semaphores) { 2655 int num_rings; 2656 2657 engine->emit_breadcrumb = gen6_sema_emit_breadcrumb; 2658 2659 num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask) - 1; 2660 if (INTEL_GEN(dev_priv) >= 8) { 2661 engine->emit_breadcrumb_sz += num_rings * 6; 2662 } else { 2663 engine->emit_breadcrumb_sz += num_rings * 3; 2664 if (num_rings & 1) 2665 engine->emit_breadcrumb_sz++; 2666 } 2667 } 2668 engine->submit_request = i9xx_submit_request; 2669 2670 if (INTEL_GEN(dev_priv) >= 8) 2671 engine->emit_bb_start = gen8_emit_bb_start; 2672 else if (INTEL_GEN(dev_priv) >= 6) 2673 engine->emit_bb_start = gen6_emit_bb_start; 2674 else if (INTEL_GEN(dev_priv) >= 4) 2675 engine->emit_bb_start = i965_emit_bb_start; 2676 else if (IS_I830(dev_priv) || IS_I845G(dev_priv)) 2677 engine->emit_bb_start = i830_emit_bb_start; 2678 else 2679 engine->emit_bb_start = i915_emit_bb_start; 2680} 2681 2682int intel_init_render_ring_buffer(struct intel_engine_cs *engine) 2683{ 2684 struct drm_i915_private *dev_priv = engine->i915; 2685 int ret; 2686 2687 intel_ring_default_vfuncs(dev_priv, engine); 2688 2689 if (HAS_L3_DPF(dev_priv)) 2690 engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT; 2691 2692 if (INTEL_GEN(dev_priv) >= 8) { 2693 engine->init_context = intel_rcs_ctx_init; 2694 engine->emit_breadcrumb = gen8_render_emit_breadcrumb; 2695 engine->emit_breadcrumb_sz = gen8_render_emit_breadcrumb_sz; 2696 engine->emit_flush = gen8_render_ring_flush; 2697 if (i915.semaphores) { 2698 int num_rings; 2699 2700 engine->semaphore.signal = gen8_rcs_signal; 2701 2702 num_rings = 2703 hweight32(INTEL_INFO(dev_priv)->ring_mask) - 1; 2704 engine->emit_breadcrumb_sz += num_rings * 6; 2705 } 2706 } else if (INTEL_GEN(dev_priv) >= 6) { 2707 engine->init_context = intel_rcs_ctx_init; 2708 engine->emit_flush = gen7_render_ring_flush; 2709 if (IS_GEN6(dev_priv)) 2710 engine->emit_flush = gen6_render_ring_flush; 2711 } else if (IS_GEN5(dev_priv)) { 2712 engine->emit_flush = gen4_render_ring_flush; 2713 } else { 2714 if (INTEL_GEN(dev_priv) < 4) 2715 engine->emit_flush = gen2_render_ring_flush; 2716 else 2717 engine->emit_flush = gen4_render_ring_flush; 2718 engine->irq_enable_mask = I915_USER_INTERRUPT; 2719 } 2720 2721 if (IS_HASWELL(dev_priv)) 2722 engine->emit_bb_start = hsw_emit_bb_start; 2723 2724 engine->init_hw = init_render_ring; 2725 engine->cleanup = render_ring_cleanup; 2726 2727 ret = intel_init_ring_buffer(engine); 2728 if (ret) 2729 return ret; 2730 2731 if (INTEL_GEN(dev_priv) >= 6) { 2732 ret = intel_engine_create_scratch(engine, PAGE_SIZE); 2733 if (ret) 2734 return ret; 2735 } else if (HAS_BROKEN_CS_TLB(dev_priv)) { 2736 ret = intel_engine_create_scratch(engine, I830_WA_SIZE); 2737 if (ret) 2738 return ret; 2739 } 2740 2741 return 0; 2742} 2743 2744int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine) 2745{ 2746 struct drm_i915_private *dev_priv = engine->i915; 2747 2748 intel_ring_default_vfuncs(dev_priv, engine); 2749 2750 if (INTEL_GEN(dev_priv) >= 6) { 2751 /* gen6 bsd needs a special wa for tail updates */ 2752 if (IS_GEN6(dev_priv)) 2753 engine->submit_request = gen6_bsd_submit_request; 2754 engine->emit_flush = gen6_bsd_ring_flush; 2755 if (INTEL_GEN(dev_priv) < 8) 2756 engine->irq_enable_mask = GT_BSD_USER_INTERRUPT; 2757 } else { 2758 engine->mmio_base = BSD_RING_BASE; 2759 engine->emit_flush = bsd_ring_flush; 2760 if 
(IS_GEN5(dev_priv)) 2761 engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT; 2762 else 2763 engine->irq_enable_mask = I915_BSD_USER_INTERRUPT; 2764 } 2765 2766 return intel_init_ring_buffer(engine); 2767} 2768 2769/** 2770 * Initialize the second BSD ring (e.g. Broadwell GT3, Skylake GT3) 2771 */ 2772int intel_init_bsd2_ring_buffer(struct intel_engine_cs *engine) 2773{ 2774 struct drm_i915_private *dev_priv = engine->i915; 2775 2776 intel_ring_default_vfuncs(dev_priv, engine); 2777 2778 engine->emit_flush = gen6_bsd_ring_flush; 2779 2780 return intel_init_ring_buffer(engine); 2781} 2782 2783int intel_init_blt_ring_buffer(struct intel_engine_cs *engine) 2784{ 2785 struct drm_i915_private *dev_priv = engine->i915; 2786 2787 intel_ring_default_vfuncs(dev_priv, engine); 2788 2789 engine->emit_flush = gen6_ring_flush; 2790 if (INTEL_GEN(dev_priv) < 8) 2791 engine->irq_enable_mask = GT_BLT_USER_INTERRUPT; 2792 2793 return intel_init_ring_buffer(engine); 2794} 2795 2796int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine) 2797{ 2798 struct drm_i915_private *dev_priv = engine->i915; 2799 2800 intel_ring_default_vfuncs(dev_priv, engine); 2801 2802 engine->emit_flush = gen6_ring_flush; 2803 2804 if (INTEL_GEN(dev_priv) < 8) { 2805 engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT; 2806 engine->irq_enable = hsw_vebox_irq_enable; 2807 engine->irq_disable = hsw_vebox_irq_disable; 2808 } 2809 2810 return intel_init_ring_buffer(engine); 2811}
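/*
 * Illustrative sketch only: the command-emission pattern used by the
 * emit_bb_start and flush helpers above is to reserve a fixed number of
 * dwords with intel_ring_begin() (which may wait for ring space), write
 * exactly that many dwords with intel_ring_emit(), and then mark the end
 * of the packet with intel_ring_advance(). The helper below is a
 * hypothetical example and is not part of the driver.
 */
static int example_emit_two_noops(struct drm_i915_gem_request *req)
{
	/* hypothetical example, not used by i915 */
	struct intel_ring *ring = req->ring;
	int ret;

	/* Reserve space for two dwords; may wait for older requests. */
	ret = intel_ring_begin(req, 2);
	if (ret)
		return ret;

	/* Write exactly the reserved number of dwords. */
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_emit(ring, MI_NOOP);

	/* Mark the end of the emitted commands. */
	intel_ring_advance(ring);

	return 0;
}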