drivers/gpu/drm/drm_gpuva_mgr.c (+4, -6)

@@ -1076,7 +1076,7 @@
 		   u64 req_addr, u64 req_range,
 		   struct drm_gem_object *req_obj, u64 req_offset)
 {
-	struct drm_gpuva *va, *next, *prev = NULL;
+	struct drm_gpuva *va, *next;
 	u64 req_end = req_addr + req_range;
 	int ret;
 
@@ -1106,7 +1106,7 @@
 			ret = op_unmap_cb(ops, priv, va, merge);
 			if (ret)
 				return ret;
-			goto next;
+			continue;
 		}
 
 		if (end > req_end) {
@@ -1151,7 +1151,7 @@
 			ret = op_remap_cb(ops, priv, &p, NULL, &u);
 			if (ret)
 				return ret;
-			goto next;
+			continue;
 		}
 
 		if (end > req_end) {
@@ -1184,7 +1184,7 @@
 			ret = op_unmap_cb(ops, priv, va, merge);
 			if (ret)
 				return ret;
-			goto next;
+			continue;
 		}
 
 		if (end > req_end) {
@@ -1205,8 +1205,6 @@
 				break;
 			}
 		}
-next:
-		prev = va;
 	}
 
 	return op_map_cb(ops, priv,
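
For readers skimming the hunk above: with the unused prev bookkeeping gone, the goto at the end of each iteration collapses into a plain continue. A minimal stand-alone sketch of that control-flow equivalence (hypothetical example, not drm_gpuva code):

#include <stddef.h>

/* Count non-zero entries; the comment marks where the old
 * "goto next; ... next: prev = ...;" shape used to sit. */
static size_t count_nonzero(const int *vals, size_t n)
{
	size_t i, count = 0;

	for (i = 0; i < n; i++) {
		if (!vals[i])
			continue;	/* previously: goto next; with a trailing "next:" label */
		count++;
	}
	return count;
}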

drivers/gpu/drm/nouveau/nouveau_dma.c (+5, -2)

@@ -69,16 +69,19 @@
 }
 
 void
-nv50_dma_push(struct nouveau_channel *chan, u64 offset, int length)
+nv50_dma_push(struct nouveau_channel *chan, u64 offset, u32 length,
+	      bool no_prefetch)
 {
 	struct nvif_user *user = &chan->drm->client.device.user;
 	struct nouveau_bo *pb = chan->push.buffer;
 	int ip = (chan->dma.ib_put * 2) + chan->dma.ib_base;
 
 	BUG_ON(chan->dma.ib_free < 1);
+	WARN_ON(length > NV50_DMA_PUSH_MAX_LENGTH);
 
 	nouveau_bo_wr32(pb, ip++, lower_32_bits(offset));
-	nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8);
+	nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8 |
+			(no_prefetch ? (1 << 31) : 0));
 
 	chan->dma.ib_put = (chan->dma.ib_put + 1) & chan->dma.ib_max;
 
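
For reference, the two 32-bit words nv50_dma_push() now writes per indirect-buffer entry can be sketched as below. This is a hypothetical stand-alone helper using fixed-width C types, not code from the patch; it only restates the bit layout visible in the hunk (address split across the two words, length in bits 8..30 of the high word, bit 31 as the new no-prefetch bit).

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the IB entry layout written by nv50_dma_push(). */
static void nv50_ib_entry(uint64_t offset, uint32_t length, bool no_prefetch,
			  uint32_t *lo, uint32_t *hi)
{
	*lo = (uint32_t)offset;			/* lower_32_bits(offset) */
	*hi = (uint32_t)(offset >> 32) |	/* upper_32_bits(offset) */
	      length << 8 |			/* length field, bits 8..30 */
	      (no_prefetch ? 1u << 31 : 0);	/* new no-prefetch bit */
}

Lengths above 0x7fffff would spill into bit 31, which is what the new WARN_ON here and the validation added further down guard against.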

drivers/gpu/drm/nouveau/nouveau_dma.h (+6, -2)

@@ -31,7 +31,8 @@
 #include "nouveau_chan.h"
 
 int nouveau_dma_wait(struct nouveau_channel *, int slots, int size);
-void nv50_dma_push(struct nouveau_channel *, u64 addr, int length);
+void nv50_dma_push(struct nouveau_channel *, u64 addr, u32 length,
+		   bool no_prefetch);
 
 /*
  * There's a hw race condition where you can't jump to your PUT offset,
@@ -45,6 +44,9 @@
  * bytes so we need a larger SKIPS value.
  */
 #define NOUVEAU_DMA_SKIPS (128 / 4)
+
+/* Maximum push buffer size. */
+#define NV50_DMA_PUSH_MAX_LENGTH 0x7fffff
 
 /* Object handles - for stuff that's doesn't use handle == oclass. */
 enum {
@@ -93,7 +89,7 @@
 
 	if (chan->dma.ib_max) {
 		nv50_dma_push(chan, chan->push.addr + (chan->dma.put << 2),
-			      (chan->dma.cur - chan->dma.put) << 2);
+			      (chan->dma.cur - chan->dma.put) << 2, false);
 	} else {
 		WRITE_PUT(chan->dma.cur);
 	}
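
The new NV50_DMA_PUSH_MAX_LENGTH value follows directly from that layout: 0x7fffff is (1 << 23) - 1, the largest length that still fits the 23-bit field below the no-prefetch bit. A hypothetical compile-time check (not part of the patch, and assuming it lives somewhere the macro is visible, e.g. nouveau_dma.h) that would pin this assumption down:

#include <linux/build_bug.h>

/* Hypothetical sanity check, assuming the bit layout used in
 * nv50_dma_push(): length occupies bits 8..30, bit 31 is no-prefetch. */
static inline void nv50_dma_push_limit_check(void)
{
	BUILD_BUG_ON(NV50_DMA_PUSH_MAX_LENGTH != (1 << 23) - 1);
	BUILD_BUG_ON((NV50_DMA_PUSH_MAX_LENGTH << 8) & (1u << 31));
}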

drivers/gpu/drm/nouveau/nouveau_exec.c (+16, -3)

@@ -164,8 +164,10 @@
 	}
 
 	for (i = 0; i < exec_job->push.count; i++) {
-		nv50_dma_push(chan, exec_job->push.s[i].va,
-			      exec_job->push.s[i].va_len);
+		struct drm_nouveau_exec_push *p = &exec_job->push.s[i];
+		bool no_prefetch = p->flags & DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH;
+
+		nv50_dma_push(chan, p->va, p->va_len, no_prefetch);
 	}
 
 	ret = nouveau_fence_emit(fence, chan);
@@ -225,7 +223,18 @@
 {
 	struct nouveau_exec_job *job;
 	struct nouveau_job_args args = {};
-	int ret;
+	int i, ret;
+
+	for (i = 0; i < __args->push.count; i++) {
+		struct drm_nouveau_exec_push *p = &__args->push.s[i];
+
+		if (unlikely(p->va_len > NV50_DMA_PUSH_MAX_LENGTH)) {
+			NV_PRINTK(err, nouveau_cli(__args->file_priv),
+				  "pushbuf size exceeds limit: 0x%x max 0x%x\n",
+				  p->va_len, NV50_DMA_PUSH_MAX_LENGTH);
+			return -EINVAL;
+		}
+	}
 
 	job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
 	if (!job)
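
From the userspace side, a push entry for the EXEC uAPI would be filled roughly as below. This is a hedged sketch, not libdrm or kernel code; it assumes the uAPI header from the last hunk of this series is installed as <drm/nouveau_drm.h> and only uses the fields visible there (va, va_len, flags).

#include <drm/nouveau_drm.h>

/* Fill one push entry with prefetching disabled; va_len must stay at
 * or below the kernel's NV50_DMA_PUSH_MAX_LENGTH (0x7fffff), otherwise
 * job initialization (second hunk above) rejects it with -EINVAL. */
static void fill_push_entry(struct drm_nouveau_exec_push *p,
			    __u64 va, __u32 va_len)
{
	p->va = va;
	p->va_len = va_len;
	p->flags = DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH;
}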

drivers/gpu/drm/nouveau/nouveau_gem.c (+4, -2)

@@ -856,9 +856,11 @@
 		for (i = 0; i < req->nr_push; i++) {
 			struct nouveau_vma *vma = (void *)(unsigned long)
 				bo[push[i].bo_index].user_priv;
+			u64 addr = vma->addr + push[i].offset;
+			u32 length = push[i].length & ~NOUVEAU_GEM_PUSHBUF_NO_PREFETCH;
+			bool no_prefetch = push[i].length & NOUVEAU_GEM_PUSHBUF_NO_PREFETCH;
 
-			nv50_dma_push(chan, vma->addr + push[i].offset,
-				      push[i].length);
+			nv50_dma_push(chan, addr, length, no_prefetch);
 		}
 	} else
 	if (drm->client.device.info.chipset >= 0x25) {
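
The legacy pushbuf path keeps its single length word and borrows bit 23 for the flag, which works because valid lengths never exceed 0x7fffff (bits 0..22). A hypothetical userspace-side encoding helper, only to illustrate the convention this hunk decodes:

#include <stdbool.h>
#include <drm/nouveau_drm.h>

/* Encode a legacy pushbuf length with the optional no-prefetch bit.
 * len_bytes is assumed to be <= 0x7fffff, so bit 23 is free. */
static __u32 nouveau_pushbuf_length(__u32 len_bytes, bool no_prefetch)
{
	return len_bytes | (no_prefetch ? NOUVEAU_GEM_PUSHBUF_NO_PREFETCH : 0);
}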

drivers/gpu/drm/nouveau/nouveau_sched.c (+22)

@@ -292,6 +292,28 @@
 	if (job->sync)
 		done_fence = dma_fence_get(job->done_fence);
 
+	/* If a sched job depends on a dma-fence from a job from the same GPU
+	 * scheduler instance, but a different scheduler entity, the GPU
+	 * scheduler only waits for the particular job to be scheduled,
+	 * rather than for the job to fully complete. This is due to the GPU
+	 * scheduler assuming that there is a scheduler instance per ring.
+	 * However, the current implementation, in order to avoid an arbitrary
+	 * number of kthreads, has a single scheduler instance while scheduler
+	 * entities represent rings.
+	 *
+	 * As a workaround, set the DRM_SCHED_FENCE_DONT_PIPELINE flag for all
+	 * out-fences in order to force the scheduler to wait for full job
+	 * completion for dependent jobs from different entities and the same
+	 * scheduler instance.
+	 *
+	 * There is some work in progress [1] to address the issues of firmware
+	 * schedulers; once it is in-tree the scheduler topology in Nouveau
+	 * should be re-worked accordingly.
+	 *
+	 * [1] https://lore.kernel.org/dri-devel/20230801205103.627779-1-matthew.brost@intel.com/
+	 */
+	set_bit(DRM_SCHED_FENCE_DONT_PIPELINE, &job->done_fence->flags);
+
 	if (job->ops->armed_submit)
 		job->ops->armed_submit(job);
 
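
The comment above refers to the scheduler's dependency handling; paraphrased and simplified (not verbatim upstream code), the decision the flag short-circuits looks roughly like this:

#include <linux/bitops.h>
#include <linux/dma-fence.h>
#include <drm/gpu_scheduler.h>

/* Paraphrased sketch: a dependency fence from the same scheduler
 * instance is normally only waited on until it is *scheduled*;
 * DRM_SCHED_FENCE_DONT_PIPELINE forces a wait for full completion
 * instead. Hypothetical helper, not the in-tree implementation. */
static bool nouveau_dep_can_pipeline(struct drm_gpu_scheduler *sched,
				     struct dma_fence *fence)
{
	struct drm_sched_fence *s_fence = to_drm_sched_fence(fence);

	return s_fence && s_fence->sched == sched &&
	       !test_bit(DRM_SCHED_FENCE_DONT_PIPELINE, &fence->flags);
}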

drivers/gpu/drm/nouveau/nouveau_uvmm.c (+1, hunk not expanded)

drivers/gpu/drm/tests/drm_kunit_helpers.c (+1, -1, hunk not expanded)

drivers/gpu/drm/ttm/tests/ttm_pool_test.c (+2, -2)

@@ -228,8 +228,8 @@
 	dma1 = tt->dma_address[0];
 	dma2 = tt->dma_address[tt->num_pages - 1];
 
-	KUNIT_ASSERT_NOT_NULL(test, (void *)dma1);
-	KUNIT_ASSERT_NOT_NULL(test, (void *)dma2);
+	KUNIT_ASSERT_NOT_NULL(test, (void *)(uintptr_t)dma1);
+	KUNIT_ASSERT_NOT_NULL(test, (void *)(uintptr_t)dma2);
 
 	ttm_pool_free(pool, tt);
 	ttm_tt_fini(tt);
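
The double cast is about pointer width, not the assertion itself: dma_addr_t can be 64-bit on 32-bit builds (CONFIG_ARCH_DMA_ADDR_T_64BIT), and casting a wider integer straight to void * triggers -Wint-to-pointer-cast. A stand-alone illustration with fixed-width types (hypothetical names):

#include <stdint.h>

/* Stand-in for a 64-bit dma_addr_t on a target with 32-bit pointers. */
typedef uint64_t example_dma_addr_t;

static void *dma_cookie(example_dma_addr_t dma)
{
	/* return (void *)dma;  would warn where sizeof(void *) < 8 */
	return (void *)(uintptr_t)dma;	/* explicit truncation, no warning */
}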

include/uapi/drm/nouveau_drm.h (+7, -1)

@@ -138,6 +138,7 @@
 	__u32 pad;
 	__u64 offset;
 	__u64 length;
+#define NOUVEAU_GEM_PUSHBUF_NO_PREFETCH (1 << 23)
 };
 
 struct drm_nouveau_gem_pushbuf {
@@ -339,7 +338,12 @@
 	/**
	 * @va_len: the length of the push buffer mapping
	 */
-	__u64 va_len;
+	__u32 va_len;
+	/**
+	 * @flags: the flags for this push buffer mapping
+	 */
+	__u32 flags;
+#define DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH 0x1
 };
 
 /**
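
Splitting the former __u64 va_len into __u32 va_len plus __u32 flags keeps struct drm_nouveau_exec_push at the same size and field offsets, so the binary layout of the existing fields is unchanged. A condensed view of the resulting layout, as far as the hunks above show it (kernel-doc comments dropped, illustrative struct name):

#include <linux/types.h>

/* Condensed mirror of struct drm_nouveau_exec_push after this change. */
struct drm_nouveau_exec_push_condensed {
	__u64 va;	/* push buffer mapping address */
	__u32 va_len;	/* mapping length, previously __u64 */
	__u32 flags;	/* DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH */
};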