Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe: Avoid reading RMW registers in emit_wa_job

To allow VFs to properly handle LRC WAs, we should postpone all
RMW register operations and let them be run by the engine itself,
since any attempt to read registers from within the driver will
fail on the VF. Use MI_MATH and the ALU for that.

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Reviewed-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250303173522.1822-4-michal.wajdeczko@intel.com

+63 -21
+63 -21
drivers/gpu/drm/xe/xe_gt.c
··· 12 12 13 13 #include <generated/xe_wa_oob.h> 14 14 15 + #include "instructions/xe_alu_commands.h" 15 16 #include "instructions/xe_gfxpipe_commands.h" 16 17 #include "instructions/xe_mi_commands.h" 18 + #include "regs/xe_engine_regs.h" 17 19 #include "regs/xe_gt_regs.h" 18 20 #include "xe_assert.h" 19 21 #include "xe_bb.h" ··· 178 176 return 0; 179 177 } 180 178 181 - /* 182 - * Convert back from encoded value to type-safe, only to be used when reg.mcr 183 - * is true 184 - */ 185 - static struct xe_reg_mcr to_xe_reg_mcr(const struct xe_reg reg) 186 - { 187 - return (const struct xe_reg_mcr){.__reg.raw = reg.raw }; 188 - } 189 - 190 179 static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) 191 180 { 192 181 struct xe_reg_sr *sr = &q->hwe->reg_lrc; ··· 187 194 struct xe_bb *bb; 188 195 struct dma_fence *fence; 189 196 long timeout; 197 + int count_rmw = 0; 190 198 int count = 0; 191 199 192 200 if (q->hwe->class == XE_ENGINE_CLASS_RENDER) ··· 200 206 if (IS_ERR(bb)) 201 207 return PTR_ERR(bb); 202 208 203 - xa_for_each(&sr->xa, idx, entry) 204 - ++count; 209 + /* count RMW registers as those will be handled separately */ 210 + xa_for_each(&sr->xa, idx, entry) { 211 + if (entry->reg.masked || entry->clr_bits == ~0) 212 + ++count; 213 + else 214 + ++count_rmw; 215 + } 216 + 217 + if (count || count_rmw) 218 + xe_gt_dbg(gt, "LRC WA %s save-restore batch\n", sr->name); 205 219 206 220 if (count) { 207 - xe_gt_dbg(gt, "LRC WA %s save-restore batch\n", sr->name); 221 + /* emit single LRI with all non RMW regs */ 208 222 209 223 bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count); 210 224 211 225 xa_for_each(&sr->xa, idx, entry) { 212 226 struct xe_reg reg = entry->reg; 213 - struct xe_reg_mcr reg_mcr = to_xe_reg_mcr(reg); 214 227 u32 val; 215 228 216 - /* 217 - * Skip reading the register if it's not really needed 218 - */ 219 229 if (reg.masked) 220 230 val = entry->clr_bits << 16; 221 - else if (entry->clr_bits + 1) 222 - val = (reg.mcr ? 
223 - xe_gt_mcr_unicast_read_any(gt, reg_mcr) : 224 - xe_mmio_read32(&gt->mmio, reg)) & (~entry->clr_bits); 225 - else 231 + else if (entry->clr_bits == ~0) 226 232 val = 0; 233 + else 234 + continue; 227 235 228 236 val |= entry->set_bits; 229 237 ··· 233 237 bb->cs[bb->len++] = val; 234 238 xe_gt_dbg(gt, "REG[0x%x] = 0x%08x", reg.addr, val); 235 239 } 240 + } 241 + 242 + if (count_rmw) { 243 + /* emit MI_MATH for each RMW reg */ 244 + 245 + xa_for_each(&sr->xa, idx, entry) { 246 + if (entry->reg.masked || entry->clr_bits == ~0) 247 + continue; 248 + 249 + bb->cs[bb->len++] = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO; 250 + bb->cs[bb->len++] = entry->reg.addr; 251 + bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr; 252 + 253 + bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) | 254 + MI_LRI_LRM_CS_MMIO; 255 + bb->cs[bb->len++] = CS_GPR_REG(0, 1).addr; 256 + bb->cs[bb->len++] = entry->clr_bits; 257 + bb->cs[bb->len++] = CS_GPR_REG(0, 2).addr; 258 + bb->cs[bb->len++] = entry->set_bits; 259 + 260 + bb->cs[bb->len++] = MI_MATH(8); 261 + bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCA, REG0); 262 + bb->cs[bb->len++] = CS_ALU_INSTR_LOADINV(SRCB, REG1); 263 + bb->cs[bb->len++] = CS_ALU_INSTR_AND; 264 + bb->cs[bb->len++] = CS_ALU_INSTR_STORE(REG0, ACCU); 265 + bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCA, REG0); 266 + bb->cs[bb->len++] = CS_ALU_INSTR_LOAD(SRCB, REG2); 267 + bb->cs[bb->len++] = CS_ALU_INSTR_OR; 268 + bb->cs[bb->len++] = CS_ALU_INSTR_STORE(REG0, ACCU); 269 + 270 + bb->cs[bb->len++] = MI_LOAD_REGISTER_REG | MI_LRR_SRC_CS_MMIO; 271 + bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr; 272 + bb->cs[bb->len++] = entry->reg.addr; 273 + 274 + xe_gt_dbg(gt, "REG[%#x] = ~%#x|%#x\n", 275 + entry->reg.addr, entry->clr_bits, entry->set_bits); 276 + } 277 + 278 + /* reset used GPR */ 279 + bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(3) | MI_LRI_LRM_CS_MMIO; 280 + bb->cs[bb->len++] = CS_GPR_REG(0, 0).addr; 281 + bb->cs[bb->len++] = 0; 282 + bb->cs[bb->len++] = 
CS_GPR_REG(0, 1).addr; 283 + bb->cs[bb->len++] = 0; 284 + bb->cs[bb->len++] = CS_GPR_REG(0, 2).addr; 285 + bb->cs[bb->len++] = 0; 236 286 } 237 287 238 288 xe_lrc_emit_hwe_state_instructions(q, bb);