Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'drm-intel-next-fixes-2022-08-11' of git://anongit.freedesktop.org/drm/drm-intel into drm-fixes

- disable PCI BAR resize on 32-bit systems (Nirmoy)
- don't leak the CCS state (Matt)
- TLB invalidation fixes (Chris)
[now with all fixes of fixes]

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/YvVumNCga+90fYN0@intel.com

+183 -55
+4 -12
drivers/gpu/drm/i915/gem/i915_gem_object.c
··· 268 268 */ 269 269 void __i915_gem_object_pages_fini(struct drm_i915_gem_object *obj) 270 270 { 271 - assert_object_held(obj); 271 + assert_object_held_shared(obj); 272 272 273 273 if (!list_empty(&obj->vma.list)) { 274 274 struct i915_vma *vma; ··· 331 331 continue; 332 332 } 333 333 334 - if (!i915_gem_object_trylock(obj, NULL)) { 335 - /* busy, toss it back to the pile */ 336 - if (llist_add(&obj->freed, &i915->mm.free_list)) 337 - queue_delayed_work(i915->wq, &i915->mm.free_work, msecs_to_jiffies(10)); 338 - continue; 339 - } 340 - 341 334 __i915_gem_object_pages_fini(obj); 342 - i915_gem_object_unlock(obj); 343 335 __i915_gem_free_object(obj); 344 336 345 337 /* But keep the pointer alive for RCU-protected lookups */ ··· 351 359 static void __i915_gem_free_work(struct work_struct *work) 352 360 { 353 361 struct drm_i915_private *i915 = 354 - container_of(work, struct drm_i915_private, mm.free_work.work); 362 + container_of(work, struct drm_i915_private, mm.free_work); 355 363 356 364 i915_gem_flush_free_objects(i915); 357 365 } ··· 383 391 */ 384 392 385 393 if (llist_add(&obj->freed, &i915->mm.free_list)) 386 - queue_delayed_work(i915->wq, &i915->mm.free_work, 0); 394 + queue_work(i915->wq, &i915->mm.free_work); 387 395 } 388 396 389 397 void __i915_gem_object_flush_frontbuffer(struct drm_i915_gem_object *obj, ··· 737 745 738 746 void i915_gem_init__objects(struct drm_i915_private *i915) 739 747 { 740 - INIT_DELAYED_WORK(&i915->mm.free_work, __i915_gem_free_work); 748 + INIT_WORK(&i915->mm.free_work, __i915_gem_free_work); 741 749 } 742 750 743 751 void i915_objects_module_exit(void)
+2 -1
drivers/gpu/drm/i915/gem/i915_gem_object_types.h
··· 335 335 #define I915_BO_READONLY BIT(7) 336 336 #define I915_TILING_QUIRK_BIT 8 /* unknown swizzling; do not release! */ 337 337 #define I915_BO_PROTECTED BIT(9) 338 - #define I915_BO_WAS_BOUND_BIT 10 339 338 /** 340 339 * @mem_flags - Mutable placement-related flags 341 340 * ··· 615 616 * pages were last acquired. 616 617 */ 617 618 bool dirty:1; 619 + 620 + u32 tlb; 618 621 } mm; 619 622 620 623 struct {
+16 -9
drivers/gpu/drm/i915/gem/i915_gem_pages.c
··· 6 6 7 7 #include <drm/drm_cache.h> 8 8 9 + #include "gt/intel_gt.h" 10 + #include "gt/intel_gt_pm.h" 11 + 9 12 #include "i915_drv.h" 10 13 #include "i915_gem_object.h" 11 14 #include "i915_scatterlist.h" 12 15 #include "i915_gem_lmem.h" 13 16 #include "i915_gem_mman.h" 14 - 15 - #include "gt/intel_gt.h" 16 17 17 18 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj, 18 19 struct sg_table *pages, ··· 191 190 vunmap(ptr); 192 191 } 193 192 193 + static void flush_tlb_invalidate(struct drm_i915_gem_object *obj) 194 + { 195 + struct drm_i915_private *i915 = to_i915(obj->base.dev); 196 + struct intel_gt *gt = to_gt(i915); 197 + 198 + if (!obj->mm.tlb) 199 + return; 200 + 201 + intel_gt_invalidate_tlb(gt, obj->mm.tlb); 202 + obj->mm.tlb = 0; 203 + } 204 + 194 205 struct sg_table * 195 206 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj) 196 207 { ··· 228 215 __i915_gem_object_reset_page_iter(obj); 229 216 obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0; 230 217 231 - if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) { 232 - struct drm_i915_private *i915 = to_i915(obj->base.dev); 233 - intel_wakeref_t wakeref; 234 - 235 - with_intel_runtime_pm_if_active(&i915->runtime_pm, wakeref) 236 - intel_gt_invalidate_tlbs(to_gt(i915)); 237 - } 218 + flush_tlb_invalidate(obj); 238 219 239 220 return pages; 240 221 }
+60 -17
drivers/gpu/drm/i915/gt/intel_gt.c
··· 11 11 #include "pxp/intel_pxp.h" 12 12 13 13 #include "i915_drv.h" 14 + #include "i915_perf_oa_regs.h" 14 15 #include "intel_context.h" 16 + #include "intel_engine_pm.h" 15 17 #include "intel_engine_regs.h" 16 18 #include "intel_ggtt_gmch.h" 17 19 #include "intel_gt.h" ··· 38 36 { 39 37 spin_lock_init(&gt->irq_lock); 40 38 41 - mutex_init(&gt->tlb_invalidate_lock); 42 - 43 39 INIT_LIST_HEAD(&gt->closed_vma); 44 40 spin_lock_init(&gt->closed_lock); 45 41 ··· 48 48 intel_gt_init_reset(gt); 49 49 intel_gt_init_requests(gt); 50 50 intel_gt_init_timelines(gt); 51 + mutex_init(&gt->tlb.invalidate_lock); 52 + seqcount_mutex_init(&gt->tlb.seqno, &gt->tlb.invalidate_lock); 51 53 intel_gt_pm_init_early(gt); 52 54 53 55 intel_uc_init_early(&gt->uc); ··· 770 768 intel_gt_fini_requests(gt); 771 769 intel_gt_fini_reset(gt); 772 770 intel_gt_fini_timelines(gt); 771 + mutex_destroy(&gt->tlb.invalidate_lock); 773 772 intel_engines_free(gt); 774 773 } 775 774 } ··· 909 906 return rb; 910 907 } 911 908 912 - void intel_gt_invalidate_tlbs(struct intel_gt *gt) 909 + static void mmio_invalidate_full(struct intel_gt *gt) 913 910 { 914 911 static const i915_reg_t gen8_regs[] = { 915 912 [RENDER_CLASS] = GEN8_RTCR, ··· 927 924 struct drm_i915_private *i915 = gt->i915; 928 925 struct intel_uncore *uncore = gt->uncore; 929 926 struct intel_engine_cs *engine; 927 + intel_engine_mask_t awake, tmp; 930 928 enum intel_engine_id id; 931 929 const i915_reg_t *regs; 932 930 unsigned int num = 0; 933 - 934 - if (I915_SELFTEST_ONLY(gt->awake == -ENODEV)) 935 - return; 936 931 937 932 if (GRAPHICS_VER(i915) == 12) { 938 933 regs = gen12_regs; ··· 946 945 "Platform does not implement TLB invalidation!")) 947 946 return; 948 947 949 - GEM_TRACE("\n"); 950 - 951 - assert_rpm_wakelock_held(&i915->runtime_pm); 952 - 953 - mutex_lock(&gt->tlb_invalidate_lock); 954 948 intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); 955 949 956 950 spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */ 
957 951 952 + awake = 0; 958 953 for_each_engine(engine, gt, id) { 959 954 struct reg_and_bit rb; 955 + 956 + if (!intel_engine_pm_is_awake(engine)) 957 + continue; 960 958 961 959 rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); 962 960 if (!i915_mmio_reg_offset(rb.reg)) 963 961 continue; 964 962 965 963 intel_uncore_write_fw(uncore, rb.reg, rb.bit); 964 + awake |= engine->mask; 966 965 } 966 + 967 + GT_TRACE(gt, "invalidated engines %08x\n", awake); 968 + 969 + /* Wa_2207587034:tgl,dg1,rkl,adl-s,adl-p */ 970 + if (awake && 971 + (IS_TIGERLAKE(i915) || 972 + IS_DG1(i915) || 973 + IS_ROCKETLAKE(i915) || 974 + IS_ALDERLAKE_S(i915) || 975 + IS_ALDERLAKE_P(i915))) 976 + intel_uncore_write_fw(uncore, GEN12_OA_TLB_INV_CR, 1); 967 977 968 978 spin_unlock_irq(&uncore->lock); 969 979 970 - for_each_engine(engine, gt, id) { 980 + for_each_engine_masked(engine, gt, awake, tmp) { 981 + struct reg_and_bit rb; 982 + 971 983 /* 972 984 * HW architecture suggest typical invalidation time at 40us, 973 985 * with pessimistic cases up to 100us and a recommendation to ··· 988 974 */ 989 975 const unsigned int timeout_us = 100; 990 976 const unsigned int timeout_ms = 4; 991 - struct reg_and_bit rb; 992 977 993 978 rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); 994 - if (!i915_mmio_reg_offset(rb.reg)) 995 - continue; 996 - 997 979 if (__intel_wait_for_register_fw(uncore, 998 980 rb.reg, rb.bit, 0, 999 981 timeout_us, timeout_ms, ··· 1006 996 * transitions. 
1007 997 */ 1008 998 intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL); 1009 - mutex_unlock(&gt->tlb_invalidate_lock); 999 + } 1000 + 1001 + static bool tlb_seqno_passed(const struct intel_gt *gt, u32 seqno) 1002 + { 1003 + u32 cur = intel_gt_tlb_seqno(gt); 1004 + 1005 + /* Only skip if a *full* TLB invalidate barrier has passed */ 1006 + return (s32)(cur - ALIGN(seqno, 2)) > 0; 1007 + } 1008 + 1009 + void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno) 1010 + { 1011 + intel_wakeref_t wakeref; 1012 + 1013 + if (I915_SELFTEST_ONLY(gt->awake == -ENODEV)) 1014 + return; 1015 + 1016 + if (intel_gt_is_wedged(gt)) 1017 + return; 1018 + 1019 + if (tlb_seqno_passed(gt, seqno)) 1020 + return; 1021 + 1022 + with_intel_gt_pm_if_awake(gt, wakeref) { 1023 + mutex_lock(&gt->tlb.invalidate_lock); 1024 + if (tlb_seqno_passed(gt, seqno)) 1025 + goto unlock; 1026 + 1027 + mmio_invalidate_full(gt); 1028 + 1029 + write_seqcount_invalidate(&gt->tlb.seqno); 1030 + unlock: 1031 + mutex_unlock(&gt->tlb.invalidate_lock); 1032 + } 1010 1033 }
+11 -1
drivers/gpu/drm/i915/gt/intel_gt.h
··· 101 101 102 102 void intel_gt_watchdog_work(struct work_struct *work); 103 103 104 - void intel_gt_invalidate_tlbs(struct intel_gt *gt); 104 + static inline u32 intel_gt_tlb_seqno(const struct intel_gt *gt) 105 + { 106 + return seqprop_sequence(&gt->tlb.seqno); 107 + } 108 + 109 + static inline u32 intel_gt_next_invalidate_tlb_full(const struct intel_gt *gt) 110 + { 111 + return intel_gt_tlb_seqno(gt) | 1; 112 + } 113 + 114 + void intel_gt_invalidate_tlb(struct intel_gt *gt, u32 seqno); 105 115 106 116 #endif /* __INTEL_GT_H__ */
+3
drivers/gpu/drm/i915/gt/intel_gt_pm.h
··· 55 55 for (tmp = 1, intel_gt_pm_get(gt); tmp; \ 56 56 intel_gt_pm_put(gt), tmp = 0) 57 57 58 + #define with_intel_gt_pm_if_awake(gt, wf) \ 59 + for (wf = intel_gt_pm_get_if_awake(gt); wf; intel_gt_pm_put_async(gt), wf = 0) 60 + 58 61 static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt) 59 62 { 60 63 return intel_wakeref_wait_for_idle(&gt->wakeref);
+17 -1
drivers/gpu/drm/i915/gt/intel_gt_types.h
··· 11 11 #include <linux/llist.h> 12 12 #include <linux/mutex.h> 13 13 #include <linux/notifier.h> 14 + #include <linux/seqlock.h> 14 15 #include <linux/spinlock.h> 15 16 #include <linux/types.h> 16 17 #include <linux/workqueue.h> ··· 84 83 struct intel_uc uc; 85 84 struct intel_gsc gsc; 86 85 87 - struct mutex tlb_invalidate_lock; 86 + struct { 87 + /* Serialize global tlb invalidations */ 88 + struct mutex invalidate_lock; 89 + 90 + /* 91 + * Batch TLB invalidations 92 + * 93 + * After unbinding the PTE, we need to ensure the TLB 94 + * are invalidated prior to releasing the physical pages. 95 + * But we only need one such invalidation for all unbinds, 96 + * so we track how many TLB invalidations have been 97 + * performed since unbind the PTE and only emit an extra 98 + * invalidate if no full barrier has been passed. 99 + */ 100 + seqcount_mutex_t seqno; 101 + } tlb; 88 102 89 103 struct i915_wa_list wa_list; 90 104
+22 -1
drivers/gpu/drm/i915/gt/intel_migrate.c
··· 708 708 u8 src_access, dst_access; 709 709 struct i915_request *rq; 710 710 int src_sz, dst_sz; 711 - bool ccs_is_src; 711 + bool ccs_is_src, overwrite_ccs; 712 712 int err; 713 713 714 714 GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm); ··· 748 748 if (ccs_bytes_to_cpy) 749 749 get_ccs_sg_sgt(&it_ccs, bytes_to_cpy); 750 750 } 751 + 752 + overwrite_ccs = HAS_FLAT_CCS(i915) && !ccs_bytes_to_cpy && dst_is_lmem; 751 753 752 754 src_offset = 0; 753 755 dst_offset = CHUNK_SZ; ··· 854 852 if (err) 855 853 goto out_rq; 856 854 ccs_bytes_to_cpy -= ccs_sz; 855 + } else if (overwrite_ccs) { 856 + err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); 857 + if (err) 858 + goto out_rq; 859 + 860 + /* 861 + * While we can't always restore/manage the CCS state, 862 + * we still need to ensure we don't leak the CCS state 863 + * from the previous user, so make sure we overwrite it 864 + * with something. 865 + */ 866 + err = emit_copy_ccs(rq, dst_offset, INDIRECT_ACCESS, 867 + dst_offset, DIRECT_ACCESS, len); 868 + if (err) 869 + goto out_rq; 870 + 871 + err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); 872 + if (err) 873 + goto out_rq; 857 874 } 858 875 859 876 /* Arbitration is re-enabled between requests. */
+6 -2
drivers/gpu/drm/i915/gt/intel_ppgtt.c
··· 206 206 void ppgtt_unbind_vma(struct i915_address_space *vm, 207 207 struct i915_vma_resource *vma_res) 208 208 { 209 - if (vma_res->allocated) 210 - vm->clear_range(vm, vma_res->start, vma_res->vma_size); 209 + if (!vma_res->allocated) 210 + return; 211 + 212 + vm->clear_range(vm, vma_res->start, vma_res->vma_size); 213 + if (vma_res->tlb) 214 + vma_invalidate_tlb(vm, vma_res->tlb); 211 215 } 212 216 213 217 static unsigned long pd_count(u64 size, int shift)
+4
drivers/gpu/drm/i915/gt/intel_region_lmem.c
··· 15 15 #include "gt/intel_gt_mcr.h" 16 16 #include "gt/intel_gt_regs.h" 17 17 18 + #ifdef CONFIG_64BIT 18 19 static void _release_bars(struct pci_dev *pdev) 19 20 { 20 21 int resno; ··· 112 111 pci_assign_unassigned_bus_resources(pdev->bus); 113 112 pci_write_config_dword(pdev, PCI_COMMAND, pci_cmd); 114 113 } 114 + #else 115 + static void i915_resize_lmem_bar(struct drm_i915_private *i915, resource_size_t lmem_size) {} 116 + #endif 115 117 116 118 static int 117 119 region_lmem_release(struct intel_memory_region *mem)
+2 -2
drivers/gpu/drm/i915/i915_drv.h
··· 247 247 * List of objects which are pending destruction. 248 248 */ 249 249 struct llist_head free_list; 250 - struct delayed_work free_work; 250 + struct work_struct free_work; 251 251 /** 252 252 * Count of objects pending destructions. Used to skip needlessly 253 253 * waiting on an RCU barrier if no objects are waiting to be freed. ··· 1378 1378 * armed the work again. 1379 1379 */ 1380 1380 while (atomic_read(&i915->mm.free_count)) { 1381 - flush_delayed_work(&i915->mm.free_work); 1381 + flush_work(&i915->mm.free_work); 1382 1382 flush_delayed_work(&i915->bdev.wq); 1383 1383 rcu_barrier(); 1384 1384 }
+26 -7
drivers/gpu/drm/i915/i915_vma.c
··· 538 538 bind_flags); 539 539 } 540 540 541 - set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags); 542 - 543 541 atomic_or(bind_flags, &vma->flags); 544 542 return 0; 545 543 } ··· 1308 1310 return err; 1309 1311 } 1310 1312 1313 + void vma_invalidate_tlb(struct i915_address_space *vm, u32 *tlb) 1314 + { 1315 + /* 1316 + * Before we release the pages that were bound by this vma, we 1317 + * must invalidate all the TLBs that may still have a reference 1318 + * back to our physical address. It only needs to be done once, 1319 + * so after updating the PTE to point away from the pages, record 1320 + * the most recent TLB invalidation seqno, and if we have not yet 1321 + * flushed the TLBs upon release, perform a full invalidation. 1322 + */ 1323 + WRITE_ONCE(*tlb, intel_gt_next_invalidate_tlb_full(vm->gt)); 1324 + } 1325 + 1311 1326 static void __vma_put_pages(struct i915_vma *vma, unsigned int count) 1312 1327 { 1313 1328 /* We allocate under vma_get_pages, so beware the shrinker */ ··· 1952 1941 vma->vm->skip_pte_rewrite; 1953 1942 trace_i915_vma_unbind(vma); 1954 1943 1955 - unbind_fence = i915_vma_resource_unbind(vma_res); 1944 + if (async) 1945 + unbind_fence = i915_vma_resource_unbind(vma_res, 1946 + &vma->obj->mm.tlb); 1947 + else 1948 + unbind_fence = i915_vma_resource_unbind(vma_res, NULL); 1949 + 1956 1950 vma->resource = NULL; 1957 1951 1958 1952 atomic_and(~(I915_VMA_BIND_MASK | I915_VMA_ERROR | I915_VMA_GGTT_WRITE), ··· 1965 1949 1966 1950 i915_vma_detach(vma); 1967 1951 1968 - if (!async && unbind_fence) { 1969 - dma_fence_wait(unbind_fence, false); 1970 - dma_fence_put(unbind_fence); 1971 - unbind_fence = NULL; 1952 + if (!async) { 1953 + if (unbind_fence) { 1954 + dma_fence_wait(unbind_fence, false); 1955 + dma_fence_put(unbind_fence); 1956 + unbind_fence = NULL; 1957 + } 1958 + vma_invalidate_tlb(vma->vm, &vma->obj->mm.tlb); 1972 1959 } 1973 1960 1974 1961 /*
+1
drivers/gpu/drm/i915/i915_vma.h
··· 213 213 u64 size, u64 alignment, u64 flags); 214 214 void __i915_vma_set_map_and_fenceable(struct i915_vma *vma); 215 215 void i915_vma_revoke_mmap(struct i915_vma *vma); 216 + void vma_invalidate_tlb(struct i915_address_space *vm, u32 *tlb); 216 217 struct dma_fence *__i915_vma_evict(struct i915_vma *vma, bool async); 217 218 int __i915_vma_unbind(struct i915_vma *vma); 218 219 int __must_check i915_vma_unbind(struct i915_vma *vma);
+4 -1
drivers/gpu/drm/i915/i915_vma_resource.c
··· 223 223 * Return: A refcounted pointer to a dma-fence that signals when unbinding is 224 224 * complete. 225 225 */ 226 - struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res) 226 + struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res, 227 + u32 *tlb) 227 228 { 228 229 struct i915_address_space *vm = vma_res->vm; 230 + 231 + vma_res->tlb = tlb; 229 232 230 233 /* Reference for the sw fence */ 231 234 i915_vma_resource_get(vma_res);
+5 -1
drivers/gpu/drm/i915/i915_vma_resource.h
··· 67 67 * taken when the unbind is scheduled. 68 68 * @skip_pte_rewrite: During ggtt suspend and vm takedown pte rewriting 69 69 * needs to be skipped for unbind. 70 + * @tlb: pointer for obj->mm.tlb, if async unbind. Otherwise, NULL 70 71 * 71 72 * The lifetime of a struct i915_vma_resource is from a binding request to 72 73 * the actual possible asynchronous unbind has completed. ··· 120 119 bool immediate_unbind:1; 121 120 bool needs_wakeref:1; 122 121 bool skip_pte_rewrite:1; 122 + 123 + u32 *tlb; 123 124 }; 124 125 125 126 bool i915_vma_resource_hold(struct i915_vma_resource *vma_res, ··· 134 131 135 132 void i915_vma_resource_free(struct i915_vma_resource *vma_res); 136 133 137 - struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res); 134 + struct dma_fence *i915_vma_resource_unbind(struct i915_vma_resource *vma_res, 135 + u32 *tlb); 138 136 139 137 void __i915_vma_resource_init(struct i915_vma_resource *vma_res); 140 138