drm/i915: Restore missing command flush before interrupt on BLT ring

We always skipped flushing the BLT ring if the requested flush did not
include the RENDER domain. However, this neglects that we try to flush
the COMMAND domain after every batch and before the breadcrumb interrupt
(to make sure the batch is indeed completed prior to the interrupt
firing and so ensuring CPU coherency). As a result of the missing flush,
incoherency did indeed creep in, most notably when using lots of command
buffers and so potentially rewriting an active command buffer (i.e.
the GPU was still executing from it even though the following interrupt
had already fired and the request/buffer retired).

As all ring->flush routines now have the same preconditions, de-duplicate
and move those checks up into i915_gem_flush_ring().

Fixes gem_linear_blit.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=35284
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Tested-by: mengmeng.meng@intel.com

+57 -63
+6 -1
drivers/gpu/drm/i915/i915_gem.c
··· 2219 { 2220 int ret; 2221 2222 trace_i915_gem_ring_flush(ring, invalidate_domains, flush_domains); 2223 2224 ret = ring->flush(ring, invalidate_domains, flush_domains); 2225 if (ret) 2226 return ret; 2227 2228 - i915_gem_process_flushing_list(ring, flush_domains); 2229 return 0; 2230 } 2231
··· 2219 { 2220 int ret; 2221 2222 + if (((invalidate_domains | flush_domains) & I915_GEM_GPU_DOMAINS) == 0) 2223 + return 0; 2224 + 2225 trace_i915_gem_ring_flush(ring, invalidate_domains, flush_domains); 2226 2227 ret = ring->flush(ring, invalidate_domains, flush_domains); 2228 if (ret) 2229 return ret; 2230 2231 + if (flush_domains & I915_GEM_GPU_DOMAINS) 2232 + i915_gem_process_flushing_list(ring, flush_domains); 2233 + 2234 return 0; 2235 } 2236
+51 -62
drivers/gpu/drm/i915/intel_ringbuffer.c
··· 65 u32 cmd; 66 int ret; 67 68 - if ((invalidate_domains | flush_domains) & I915_GEM_GPU_DOMAINS) { 69 /* 70 - * read/write caches: 71 - * 72 - * I915_GEM_DOMAIN_RENDER is always invalidated, but is 73 - * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is 74 - * also flushed at 2d versus 3d pipeline switches. 75 - * 76 - * read-only caches: 77 - * 78 - * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if 79 - * MI_READ_FLUSH is set, and is always flushed on 965. 80 - * 81 - * I915_GEM_DOMAIN_COMMAND may not exist? 82 - * 83 - * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is 84 - * invalidated when MI_EXE_FLUSH is set. 85 - * 86 - * I915_GEM_DOMAIN_VERTEX, which exists on 965, is 87 - * invalidated with every MI_FLUSH. 88 - * 89 - * TLBs: 90 - * 91 - * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND 92 - * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and 93 - * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER 94 - * are flushed at any MI_FLUSH. 95 */ 96 - 97 - cmd = MI_FLUSH | MI_NO_WRITE_FLUSH; 98 - if ((invalidate_domains|flush_domains) & 99 - I915_GEM_DOMAIN_RENDER) 100 - cmd &= ~MI_NO_WRITE_FLUSH; 101 - if (INTEL_INFO(dev)->gen < 4) { 102 - /* 103 - * On the 965, the sampler cache always gets flushed 104 - * and this bit is reserved. 
105 - */ 106 - if (invalidate_domains & I915_GEM_DOMAIN_SAMPLER) 107 - cmd |= MI_READ_FLUSH; 108 - } 109 - if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION) 110 - cmd |= MI_EXE_FLUSH; 111 - 112 - if (invalidate_domains & I915_GEM_DOMAIN_COMMAND && 113 - (IS_G4X(dev) || IS_GEN5(dev))) 114 - cmd |= MI_INVALIDATE_ISP; 115 - 116 - ret = intel_ring_begin(ring, 2); 117 - if (ret) 118 - return ret; 119 - 120 - intel_ring_emit(ring, cmd); 121 - intel_ring_emit(ring, MI_NOOP); 122 - intel_ring_advance(ring); 123 } 124 125 return 0; 126 } ··· 565 u32 flush_domains) 566 { 567 int ret; 568 - 569 - if ((flush_domains & I915_GEM_DOMAIN_RENDER) == 0) 570 - return 0; 571 572 ret = intel_ring_begin(ring, 2); 573 if (ret) ··· 1051 uint32_t cmd; 1052 int ret; 1053 1054 - if (((invalidate | flush) & I915_GEM_GPU_DOMAINS) == 0) 1055 - return 0; 1056 - 1057 ret = intel_ring_begin(ring, 4); 1058 if (ret) 1059 return ret; ··· 1221 { 1222 uint32_t cmd; 1223 int ret; 1224 - 1225 - if (((invalidate | flush) & I915_GEM_DOMAIN_RENDER) == 0) 1226 - return 0; 1227 1228 ret = blt_ring_begin(ring, 4); 1229 if (ret)
··· 65 u32 cmd; 66 int ret; 67 68 + /* 69 + * read/write caches: 70 + * 71 + * I915_GEM_DOMAIN_RENDER is always invalidated, but is 72 + * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is 73 + * also flushed at 2d versus 3d pipeline switches. 74 + * 75 + * read-only caches: 76 + * 77 + * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if 78 + * MI_READ_FLUSH is set, and is always flushed on 965. 79 + * 80 + * I915_GEM_DOMAIN_COMMAND may not exist? 81 + * 82 + * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is 83 + * invalidated when MI_EXE_FLUSH is set. 84 + * 85 + * I915_GEM_DOMAIN_VERTEX, which exists on 965, is 86 + * invalidated with every MI_FLUSH. 87 + * 88 + * TLBs: 89 + * 90 + * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND 91 + * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and 92 + * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER 93 + * are flushed at any MI_FLUSH. 94 + */ 95 + 96 + cmd = MI_FLUSH | MI_NO_WRITE_FLUSH; 97 + if ((invalidate_domains|flush_domains) & 98 + I915_GEM_DOMAIN_RENDER) 99 + cmd &= ~MI_NO_WRITE_FLUSH; 100 + if (INTEL_INFO(dev)->gen < 4) { 101 /* 102 + * On the 965, the sampler cache always gets flushed 103 + * and this bit is reserved. 
104 */ 105 + if (invalidate_domains & I915_GEM_DOMAIN_SAMPLER) 106 + cmd |= MI_READ_FLUSH; 107 } 108 + if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION) 109 + cmd |= MI_EXE_FLUSH; 110 + 111 + if (invalidate_domains & I915_GEM_DOMAIN_COMMAND && 112 + (IS_G4X(dev) || IS_GEN5(dev))) 113 + cmd |= MI_INVALIDATE_ISP; 114 + 115 + ret = intel_ring_begin(ring, 2); 116 + if (ret) 117 + return ret; 118 + 119 + intel_ring_emit(ring, cmd); 120 + intel_ring_emit(ring, MI_NOOP); 121 + intel_ring_advance(ring); 122 123 return 0; 124 } ··· 567 u32 flush_domains) 568 { 569 int ret; 570 571 ret = intel_ring_begin(ring, 2); 572 if (ret) ··· 1056 uint32_t cmd; 1057 int ret; 1058 1059 ret = intel_ring_begin(ring, 4); 1060 if (ret) 1061 return ret; ··· 1229 { 1230 uint32_t cmd; 1231 int ret; 1232 1233 ret = blt_ring_begin(ring, 4); 1234 if (ret)