Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

gpu: host1x: Wait prefences outside MLOCK

The current submission opcode sequence first takes the engine MLOCK,
and then switches to HOST1X class to wait for prefences. This is fine
as long as we only use a single channel per engine and there is no
virtualization, since jobs are serialized on that one channel anyway.
However, when that assumption doesn't hold, we keep the engine locked,
with nothing running on it, while waiting for prefences to complete.

To resolve this, execute wait commands at the beginning of the job,
outside the engine MLOCK. We still take the HOST1X MLOCK because
recent hardware requires register opcodes to be executed within some
MLOCK, but the hardware also allows an unlimited number of channels to
take the HOST1X MLOCK at the same time.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
Link: https://lore.kernel.org/r/20250708-host1x-wait-prefences-outside-mlock-v1-1-13e98044e35a@nvidia.com

Authored by Mikko Perttunen and committed by Thierry Reding
63d47cc6 c7d39326

+69 -43
+69 -43
drivers/gpu/host1x/hw/channel_hw.c
··· 47 47 } 48 48 } 49 49 50 - static void submit_wait(struct host1x_job *job, u32 id, u32 threshold, 51 - u32 next_class) 50 + static void submit_wait(struct host1x_job *job, u32 id, u32 threshold) 51 + { 52 + struct host1x_cdma *cdma = &job->channel->cdma; 53 + 54 + #if HOST1X_HW >= 2 55 + host1x_cdma_push_wide(cdma, 56 + host1x_opcode_setclass( 57 + HOST1X_CLASS_HOST1X, 58 + HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32, 59 + /* WAIT_SYNCPT_32 is at SYNCPT_PAYLOAD_32+2 */ 60 + BIT(0) | BIT(2) 61 + ), 62 + threshold, 63 + id, 64 + HOST1X_OPCODE_NOP 65 + ); 66 + #else 67 + /* TODO add waitchk or use waitbases or other mitigation */ 68 + host1x_cdma_push(cdma, 69 + host1x_opcode_setclass( 70 + HOST1X_CLASS_HOST1X, 71 + host1x_uclass_wait_syncpt_r(), 72 + BIT(0) 73 + ), 74 + host1x_class_host_wait_syncpt(id, threshold) 75 + ); 76 + #endif 77 + } 78 + 79 + static void submit_setclass(struct host1x_job *job, u32 next_class) 52 80 { 53 81 struct host1x_cdma *cdma = &job->channel->cdma; 54 82 ··· 94 66 stream_id = job->engine_fallback_streamid; 95 67 96 68 host1x_cdma_push_wide(cdma, 97 - host1x_opcode_setclass( 98 - HOST1X_CLASS_HOST1X, 99 - HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32, 100 - /* WAIT_SYNCPT_32 is at SYNCPT_PAYLOAD_32+2 */ 101 - BIT(0) | BIT(2) 102 - ), 103 - threshold, 104 - id, 105 - HOST1X_OPCODE_NOP 106 - ); 107 - host1x_cdma_push_wide(&job->channel->cdma, 108 - host1x_opcode_setclass(job->class, 0, 0), 69 + host1x_opcode_setclass(next_class, 0, 0), 109 70 host1x_opcode_setpayload(stream_id), 110 71 host1x_opcode_setstreamid(job->engine_streamid_offset / 4), 111 72 HOST1X_OPCODE_NOP); 112 - #elif HOST1X_HW >= 2 113 - host1x_cdma_push_wide(cdma, 114 - host1x_opcode_setclass( 115 - HOST1X_CLASS_HOST1X, 116 - HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32, 117 - /* WAIT_SYNCPT_32 is at SYNCPT_PAYLOAD_32+2 */ 118 - BIT(0) | BIT(2) 119 - ), 120 - threshold, 121 - id, 122 - host1x_opcode_setclass(next_class, 0, 0) 123 - ); 124 73 #else 125 - /* TODO add waitchk or use waitbases or 
other mitigation */ 126 - host1x_cdma_push(cdma, 127 - host1x_opcode_setclass( 128 - HOST1X_CLASS_HOST1X, 129 - host1x_uclass_wait_syncpt_r(), 130 - BIT(0) 131 - ), 132 - host1x_class_host_wait_syncpt(id, threshold) 133 - ); 134 74 host1x_cdma_push(cdma, 135 75 host1x_opcode_setclass(next_class, 0, 0), 136 76 HOST1X_OPCODE_NOP ··· 106 110 #endif 107 111 } 108 112 109 - static void submit_gathers(struct host1x_job *job, u32 job_syncpt_base) 113 + static void submit_gathers(struct host1x_job *job, struct host1x_job_cmd *cmds, u32 num_cmds, 114 + u32 job_syncpt_base) 110 115 { 111 116 struct host1x_cdma *cdma = &job->channel->cdma; 112 117 #if HOST1X_HW < 6 ··· 116 119 unsigned int i; 117 120 u32 threshold; 118 121 119 - for (i = 0; i < job->num_cmds; i++) { 120 - struct host1x_job_cmd *cmd = &job->cmds[i]; 122 + for (i = 0; i < num_cmds; i++) { 123 + struct host1x_job_cmd *cmd = &cmds[i]; 121 124 122 125 if (cmd->is_wait) { 123 126 if (cmd->wait.relative) ··· 125 128 else 126 129 threshold = cmd->wait.threshold; 127 130 128 - submit_wait(job, cmd->wait.id, threshold, cmd->wait.next_class); 131 + submit_wait(job, cmd->wait.id, threshold); 132 + submit_setclass(job, cmd->wait.next_class); 129 133 } else { 130 134 struct host1x_job_gather *g = &cmd->gather; 131 135 ··· 214 216 215 217 #if HOST1X_HW >= 6 216 218 u32 fence; 219 + int i = 0; 217 220 221 + if (job->num_cmds == 0) 222 + goto prefences_done; 223 + if (!job->cmds[0].is_wait || job->cmds[0].wait.relative) 224 + goto prefences_done; 225 + 226 + /* Enter host1x class with invalid stream ID for prefence waits. 
*/ 227 + host1x_cdma_push_wide(cdma, 228 + host1x_opcode_acquire_mlock(1), 229 + host1x_opcode_setclass(1, 0, 0), 230 + host1x_opcode_setpayload(0), 231 + host1x_opcode_setstreamid(0x1fffff)); 232 + 233 + for (i = 0; i < job->num_cmds; i++) { 234 + struct host1x_job_cmd *cmd = &job->cmds[i]; 235 + 236 + if (!cmd->is_wait || cmd->wait.relative) 237 + break; 238 + 239 + submit_wait(job, cmd->wait.id, cmd->wait.threshold); 240 + } 241 + 242 + host1x_cdma_push(cdma, 243 + HOST1X_OPCODE_NOP, 244 + host1x_opcode_release_mlock(1)); 245 + 246 + prefences_done: 218 247 /* Enter engine class with invalid stream ID. */ 219 248 host1x_cdma_push_wide(cdma, 220 249 host1x_opcode_acquire_mlock(job->class), ··· 255 230 host1x_opcode_nonincr(HOST1X_UCLASS_INCR_SYNCPT, 1), 256 231 HOST1X_UCLASS_INCR_SYNCPT_INDX_F(job->syncpt->id) | 257 232 HOST1X_UCLASS_INCR_SYNCPT_COND_F(4)); 258 - submit_wait(job, job->syncpt->id, fence, job->class); 233 + submit_wait(job, job->syncpt->id, fence); 234 + submit_setclass(job, job->class); 259 235 260 236 /* Submit work. */ 261 237 job->syncpt_end = host1x_syncpt_incr_max(sp, job->syncpt_incrs); 262 - submit_gathers(job, job->syncpt_end - job->syncpt_incrs); 238 + submit_gathers(job, job->cmds + i, job->num_cmds - i, job->syncpt_end - job->syncpt_incrs); 263 239 264 240 /* Before releasing MLOCK, ensure engine is idle again. */ 265 241 fence = host1x_syncpt_incr_max(sp, 1); ··· 268 242 host1x_opcode_nonincr(HOST1X_UCLASS_INCR_SYNCPT, 1), 269 243 HOST1X_UCLASS_INCR_SYNCPT_INDX_F(job->syncpt->id) | 270 244 HOST1X_UCLASS_INCR_SYNCPT_COND_F(4)); 271 - submit_wait(job, job->syncpt->id, fence, job->class); 245 + submit_wait(job, job->syncpt->id, fence); 272 246 273 247 /* Release MLOCK. 
*/ 274 248 host1x_cdma_push(cdma, ··· 298 272 299 273 job->syncpt_end = host1x_syncpt_incr_max(sp, job->syncpt_incrs); 300 274 301 - submit_gathers(job, job->syncpt_end - job->syncpt_incrs); 275 + submit_gathers(job, job->cmds, job->num_cmds, job->syncpt_end - job->syncpt_incrs); 302 276 #endif 303 277 } 304 278