drivers/gpu/drm/drm_gpuva_mgr.c (+4, -6)

@@ -1076,7 +1076,7 @@
 		   u64 req_addr, u64 req_range,
 		   struct drm_gem_object *req_obj, u64 req_offset)
 {
-	struct drm_gpuva *va, *next, *prev = NULL;
+	struct drm_gpuva *va, *next;
 	u64 req_end = req_addr + req_range;
 	int ret;
 
@@ -1106,7 +1106,7 @@
 			ret = op_unmap_cb(ops, priv, va, merge);
 			if (ret)
 				return ret;
-			goto next;
+			continue;
 		}
 
 		if (end > req_end) {
@@ -1151,7 +1151,7 @@
 			ret = op_remap_cb(ops, priv, &p, NULL, &u);
 			if (ret)
 				return ret;
-			goto next;
+			continue;
 		}
 
 		if (end > req_end) {
@@ -1184,7 +1184,7 @@
 			ret = op_unmap_cb(ops, priv, va, merge);
 			if (ret)
 				return ret;
-			goto next;
+			continue;
 		}
 
 		if (end > req_end) {
@@ -1205,8 +1205,6 @@
 				break;
 			}
 		}
-next:
-		prev = va;
 	}
 
 	return op_map_cb(ops, priv,
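
For readers skimming the hunk above: with the unused prev bookkeeping gone, the goto at the end of each iteration collapses into a plain continue. A minimal stand-alone sketch of that control-flow equivalence (hypothetical example, not drm_gpuva code):

#include <stddef.h>

/* Count non-zero entries; the comment marks where the old
 * "goto next; ... next: prev = ...;" shape used to sit. */
static size_t count_nonzero(const int *vals, size_t n)
{
	size_t i, count = 0;

	for (i = 0; i < n; i++) {
		if (!vals[i])
			continue;	/* previously: goto next; with a trailing "next:" label */
		count++;
	}
	return count;
}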

drivers/gpu/drm/nouveau/nouveau_dma.c (+5, -2)

@@ -69,16 +69,19 @@
 }
 
 void
-nv50_dma_push(struct nouveau_channel *chan, u64 offset, int length)
+nv50_dma_push(struct nouveau_channel *chan, u64 offset, u32 length,
+	      bool no_prefetch)
 {
 	struct nvif_user *user = &chan->drm->client.device.user;
 	struct nouveau_bo *pb = chan->push.buffer;
 	int ip = (chan->dma.ib_put * 2) + chan->dma.ib_base;
 
 	BUG_ON(chan->dma.ib_free < 1);
+	WARN_ON(length > NV50_DMA_PUSH_MAX_LENGTH);
 
 	nouveau_bo_wr32(pb, ip++, lower_32_bits(offset));
-	nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8);
+	nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8 |
+			(no_prefetch ? (1 << 31) : 0));
 
 	chan->dma.ib_put = (chan->dma.ib_put + 1) & chan->dma.ib_max;
 
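
For reference, the two 32-bit words nv50_dma_push() now writes per indirect-buffer entry can be sketched as below. This is a hypothetical stand-alone helper using fixed-width C types, not code from the patch; it only restates the bit layout visible in the hunk (address split across the two words, length in bits 8..30 of the high word, bit 31 as the new no-prefetch bit).

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the IB entry layout written by nv50_dma_push(). */
static void nv50_ib_entry(uint64_t offset, uint32_t length, bool no_prefetch,
			  uint32_t *lo, uint32_t *hi)
{
	*lo = (uint32_t)offset;			/* lower_32_bits(offset) */
	*hi = (uint32_t)(offset >> 32) |	/* upper_32_bits(offset) */
	      length << 8 |			/* length field, bits 8..30 */
	      (no_prefetch ? 1u << 31 : 0);	/* new no-prefetch bit */
}

Lengths above 0x7fffff would spill into bit 31, which is what the new WARN_ON here and the validation added further down guard against.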

drivers/gpu/drm/nouveau/nouveau_dma.h (+6, -2)

@@ -31,7 +31,8 @@
 #include "nouveau_chan.h"
 
 int nouveau_dma_wait(struct nouveau_channel *, int slots, int size);
-void nv50_dma_push(struct nouveau_channel *, u64 addr, int length);
+void nv50_dma_push(struct nouveau_channel *, u64 addr, u32 length,
+		   bool no_prefetch);
 
 /*
  * There's a hw race condition where you can't jump to your PUT offset,
@@ -45,6 +44,9 @@
  * bytes so we need a larger SKIPS value.
  */
 #define NOUVEAU_DMA_SKIPS (128 / 4)
+
+/* Maximum push buffer size. */
+#define NV50_DMA_PUSH_MAX_LENGTH 0x7fffff
 
 /* Object handles - for stuff that's doesn't use handle == oclass. */
 enum {
@@ -93,7 +89,7 @@
 
 	if (chan->dma.ib_max) {
 		nv50_dma_push(chan, chan->push.addr + (chan->dma.put << 2),
-			      (chan->dma.cur - chan->dma.put) << 2);
+			      (chan->dma.cur - chan->dma.put) << 2, false);
 	} else {
 		WRITE_PUT(chan->dma.cur);
 	}
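
The new NV50_DMA_PUSH_MAX_LENGTH value follows directly from that layout: 0x7fffff is (1 << 23) - 1, the largest length that still fits the 23-bit field below the no-prefetch bit. A hypothetical compile-time check (not part of the patch, and assuming it lives somewhere the macro is visible, e.g. nouveau_dma.h) that would pin this assumption down:

#include <linux/build_bug.h>

/* Hypothetical sanity check, assuming the bit layout used in
 * nv50_dma_push(): length occupies bits 8..30, bit 31 is no-prefetch. */
static inline void nv50_dma_push_limit_check(void)
{
	BUILD_BUG_ON(NV50_DMA_PUSH_MAX_LENGTH != (1 << 23) - 1);
	BUILD_BUG_ON((NV50_DMA_PUSH_MAX_LENGTH << 8) & (1u << 31));
}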

drivers/gpu/drm/nouveau/nouveau_exec.c (+16, -3)

@@ -164,8 +164,10 @@
 	}
 
 	for (i = 0; i < exec_job->push.count; i++) {
-		nv50_dma_push(chan, exec_job->push.s[i].va,
-			      exec_job->push.s[i].va_len);
+		struct drm_nouveau_exec_push *p = &exec_job->push.s[i];
+		bool no_prefetch = p->flags & DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH;
+
+		nv50_dma_push(chan, p->va, p->va_len, no_prefetch);
 	}
 
 	ret = nouveau_fence_emit(fence, chan);
@@ -225,7 +223,18 @@
 {
 	struct nouveau_exec_job *job;
 	struct nouveau_job_args args = {};
-	int ret;
+	int i, ret;
+
+	for (i = 0; i < __args->push.count; i++) {
+		struct drm_nouveau_exec_push *p = &__args->push.s[i];
+
+		if (unlikely(p->va_len > NV50_DMA_PUSH_MAX_LENGTH)) {
+			NV_PRINTK(err, nouveau_cli(__args->file_priv),
+				  "pushbuf size exceeds limit: 0x%x max 0x%x\n",
+				  p->va_len, NV50_DMA_PUSH_MAX_LENGTH);
+			return -EINVAL;
+		}
+	}
 
 	job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
 	if (!job)
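
From the userspace side, a push entry for the EXEC uAPI would be filled roughly as below. This is a hedged sketch, not libdrm or kernel code; it assumes the uAPI header from the last hunk of this series is installed as <drm/nouveau_drm.h> and only uses the fields visible there (va, va_len, flags).

#include <drm/nouveau_drm.h>

/* Fill one push entry with prefetching disabled; va_len must stay at
 * or below the kernel's NV50_DMA_PUSH_MAX_LENGTH (0x7fffff), otherwise
 * job initialization (second hunk above) rejects it with -EINVAL. */
static void fill_push_entry(struct drm_nouveau_exec_push *p,
			    __u64 va, __u32 va_len)
{
	p->va = va;
	p->va_len = va_len;
	p->flags = DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH;
}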

drivers/gpu/drm/nouveau/nouveau_gem.c (+4, -2)

@@ -856,9 +856,11 @@
 		for (i = 0; i < req->nr_push; i++) {
 			struct nouveau_vma *vma = (void *)(unsigned long)
 				bo[push[i].bo_index].user_priv;
+			u64 addr = vma->addr + push[i].offset;
+			u32 length = push[i].length & ~NOUVEAU_GEM_PUSHBUF_NO_PREFETCH;
+			bool no_prefetch = push[i].length & NOUVEAU_GEM_PUSHBUF_NO_PREFETCH;
 
-			nv50_dma_push(chan, vma->addr + push[i].offset,
-				      push[i].length);
+			nv50_dma_push(chan, addr, length, no_prefetch);
 		}
 	} else
 	if (drm->client.device.info.chipset >= 0x25) {
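
The legacy pushbuf path keeps its single length word and borrows bit 23 for the flag, which works because valid lengths never exceed 0x7fffff (bits 0..22). A hypothetical userspace-side encoding helper, only to illustrate the convention this hunk decodes:

#include <stdbool.h>
#include <drm/nouveau_drm.h>

/* Encode a legacy pushbuf length with the optional no-prefetch bit.
 * len_bytes is assumed to be <= 0x7fffff, so bit 23 is free. */
static __u32 nouveau_pushbuf_length(__u32 len_bytes, bool no_prefetch)
{
	return len_bytes | (no_prefetch ? NOUVEAU_GEM_PUSHBUF_NO_PREFETCH : 0);
}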

drivers/gpu/drm/nouveau/nouveau_sched.c (+22)

@@ -292,6 +292,28 @@
 	if (job->sync)
 		done_fence = dma_fence_get(job->done_fence);
 
+	/* If a sched job depends on a dma-fence from a job from the same GPU
+	 * scheduler instance, but a different scheduler entity, the GPU
+	 * scheduler only waits for the particular job to be scheduled,
+	 * rather than for the job to fully complete. This is due to the GPU
+	 * scheduler assuming that there is a scheduler instance per ring.
+	 * However, the current implementation, in order to avoid an arbitrary
+	 * number of kthreads, has a single scheduler instance while scheduler
+	 * entities represent rings.
+	 *
+	 * As a workaround, set the DRM_SCHED_FENCE_DONT_PIPELINE flag for all
+	 * out-fences in order to force the scheduler to wait for full job
+	 * completion for dependent jobs from different entities and the same
+	 * scheduler instance.
+	 *
+	 * There is some work in progress [1] to address the issues of firmware
+	 * schedulers; once it is in-tree the scheduler topology in Nouveau
+	 * should be re-worked accordingly.
+	 *
+	 * [1] https://lore.kernel.org/dri-devel/20230801205103.627779-1-matthew.brost@intel.com/
+	 */
+	set_bit(DRM_SCHED_FENCE_DONT_PIPELINE, &job->done_fence->flags);
+
 	if (job->ops->armed_submit)
 		job->ops->armed_submit(job);
 
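
The comment above refers to the scheduler's dependency handling; paraphrased and simplified (not verbatim upstream code), the decision the flag short-circuits looks roughly like this:

#include <linux/bitops.h>
#include <linux/dma-fence.h>
#include <drm/gpu_scheduler.h>

/* Paraphrased sketch: a dependency fence from the same scheduler
 * instance is normally only waited on until it is *scheduled*;
 * DRM_SCHED_FENCE_DONT_PIPELINE forces a wait for full completion
 * instead. Hypothetical helper, not the in-tree implementation. */
static bool nouveau_dep_can_pipeline(struct drm_gpu_scheduler *sched,
				     struct dma_fence *fence)
{
	struct drm_sched_fence *s_fence = to_drm_sched_fence(fence);

	return s_fence && s_fence->sched == sched &&
	       !test_bit(DRM_SCHED_FENCE_DONT_PIPELINE, &fence->flags);
}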

drivers/gpu/drm/nouveau/nouveau_uvmm.c (+1, hunk not expanded)

drivers/gpu/drm/tests/drm_kunit_helpers.c (+1, -1, hunk not expanded)

drivers/gpu/drm/ttm/tests/ttm_pool_test.c (+2, -2)

@@ -228,8 +228,8 @@
 	dma1 = tt->dma_address[0];
 	dma2 = tt->dma_address[tt->num_pages - 1];
 
-	KUNIT_ASSERT_NOT_NULL(test, (void *)dma1);
-	KUNIT_ASSERT_NOT_NULL(test, (void *)dma2);
+	KUNIT_ASSERT_NOT_NULL(test, (void *)(uintptr_t)dma1);
+	KUNIT_ASSERT_NOT_NULL(test, (void *)(uintptr_t)dma2);
 
 	ttm_pool_free(pool, tt);
 	ttm_tt_fini(tt);
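
The double cast is about pointer width, not the assertion itself: dma_addr_t can be 64-bit on 32-bit builds (CONFIG_ARCH_DMA_ADDR_T_64BIT), and casting a wider integer straight to void * triggers -Wint-to-pointer-cast. A stand-alone illustration with fixed-width types (hypothetical names):

#include <stdint.h>

/* Stand-in for a 64-bit dma_addr_t on a target with 32-bit pointers. */
typedef uint64_t example_dma_addr_t;

static void *dma_cookie(example_dma_addr_t dma)
{
	/* return (void *)dma;  would warn where sizeof(void *) < 8 */
	return (void *)(uintptr_t)dma;	/* explicit truncation, no warning */
}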

include/uapi/drm/nouveau_drm.h (+7, -1)

@@ -138,6 +138,7 @@
 	__u32 pad;
 	__u64 offset;
 	__u64 length;
+#define NOUVEAU_GEM_PUSHBUF_NO_PREFETCH (1 << 23)
 };
 
 struct drm_nouveau_gem_pushbuf {
@@ -339,7 +338,12 @@
 	/**
	 * @va_len: the length of the push buffer mapping
	 */
-	__u64 va_len;
+	__u32 va_len;
+	/**
+	 * @flags: the flags for this push buffer mapping
+	 */
+	__u32 flags;
+#define DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH 0x1
 };
 
 /**
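
Splitting the former __u64 va_len into __u32 va_len plus __u32 flags keeps struct drm_nouveau_exec_push at the same size and field offsets, so the binary layout of the existing fields is unchanged. A condensed view of the resulting layout, as far as the hunks above show it (kernel-doc comments dropped, illustrative struct name):

#include <linux/types.h>

/* Condensed mirror of struct drm_nouveau_exec_push after this change. */
struct drm_nouveau_exec_push_condensed {
	__u64 va;	/* push buffer mapping address */
	__u32 va_len;	/* mapping length, previously __u64 */
	__u32 flags;	/* DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH */
};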