Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe: Convert existing drm_exec transactions for exhaustive eviction

Convert existing drm_exec transactions, like GT pagefault validation,
non-LR exec() IOCTL and the rebind worker to support
exhaustive eviction using the xe_validation_guard().

v2:
- Adapt to signature change in xe_validation_guard() (Matt Brost)
- Avoid gotos from within xe_validation_guard() (Matt Brost)
- Check error return from xe_validation_guard()

v3:
- Rebase on gpu_madvise()

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com> #v1
Link: https://lore.kernel.org/r/20250908101246.65025-6-thomas.hellstrom@linux.intel.com

+75 -106
+8 -12
drivers/gpu/drm/xe/xe_exec.c
··· 120 120 struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn}; 121 121 struct drm_exec *exec = &vm_exec.exec; 122 122 u32 i, num_syncs, num_ufence = 0; 123 + struct xe_validation_ctx ctx; 123 124 struct xe_sched_job *job; 124 125 struct xe_vm *vm; 125 126 bool write_locked, skip_retry = false; 126 - ktime_t end = 0; 127 127 int err = 0; 128 128 struct xe_hw_engine_group *group; 129 129 enum xe_hw_engine_group_execution_mode mode, previous_mode; ··· 251 251 if (err) 252 252 goto err_unlock_list; 253 253 254 - vm_exec.vm = &vm->gpuvm; 255 - vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT; 256 - if (xe_vm_in_lr_mode(vm)) { 257 - drm_exec_init(exec, vm_exec.flags, 0); 258 - } else { 259 - err = drm_gpuvm_exec_lock(&vm_exec); 260 - if (err) { 261 - if (xe_vm_validate_should_retry(exec, err, &end)) 262 - err = -EAGAIN; 254 + if (!xe_vm_in_lr_mode(vm)) { 255 + vm_exec.vm = &vm->gpuvm; 256 + vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT; 257 + err = xe_validation_exec_lock(&ctx, &vm_exec, &xe->val); 258 + if (err) 263 259 goto err_unlock_list; 264 - } 265 260 } 266 261 267 262 if (xe_vm_is_closed_or_banned(q->vm)) { ··· 350 355 if (err) 351 356 xe_sched_job_put(job); 352 357 err_exec: 353 - drm_exec_fini(exec); 358 + if (!xe_vm_in_lr_mode(vm)) 359 + xe_validation_ctx_fini(&ctx); 354 360 err_unlock_list: 355 361 up_read(&vm->lock); 356 362 if (err == -EAGAIN && !skip_retry)
+9 -11
drivers/gpu/drm/xe/xe_gt_pagefault.c
··· 96 96 { 97 97 struct xe_vm *vm = xe_vma_vm(vma); 98 98 struct xe_tile *tile = gt_to_tile(gt); 99 + struct xe_validation_ctx ctx; 99 100 struct drm_exec exec; 100 101 struct dma_fence *fence; 101 - ktime_t end = 0; 102 102 int err, needs_vram; 103 103 104 104 lockdep_assert_held_write(&vm->lock); ··· 127 127 } 128 128 129 129 /* Lock VM and BOs dma-resv */ 130 - drm_exec_init(&exec, 0, 0); 130 + xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {}); 131 131 drm_exec_until_all_locked(&exec) { 132 132 err = xe_pf_begin(&exec, vma, needs_vram == 1, tile->mem.vram); 133 133 drm_exec_retry_on_contention(&exec); 134 - if (xe_vm_validate_should_retry(&exec, err, &end)) 135 - err = -EAGAIN; 134 + xe_validation_retry_on_oom(&ctx, &err); 136 135 if (err) 137 136 goto unlock_dma_resv; 138 137 ··· 142 143 xe_vm_set_validation_exec(vm, NULL); 143 144 if (IS_ERR(fence)) { 144 145 err = PTR_ERR(fence); 145 - if (xe_vm_validate_should_retry(&exec, err, &end)) 146 - err = -EAGAIN; 146 + xe_validation_retry_on_oom(&ctx, &err); 147 147 goto unlock_dma_resv; 148 148 } 149 149 ··· 151 153 dma_fence_put(fence); 152 154 153 155 unlock_dma_resv: 154 - drm_exec_fini(&exec); 156 + xe_validation_ctx_fini(&ctx); 155 157 if (err == -EAGAIN) 156 158 goto retry_userptr; 157 159 ··· 533 535 { 534 536 struct xe_device *xe = gt_to_xe(gt); 535 537 struct xe_tile *tile = gt_to_tile(gt); 538 + struct xe_validation_ctx ctx; 536 539 struct drm_exec exec; 537 540 struct xe_vm *vm; 538 541 struct xe_vma *vma; ··· 563 564 goto unlock_vm; 564 565 565 566 /* Lock VM and BOs dma-resv */ 566 - drm_exec_init(&exec, 0, 0); 567 + xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {}); 567 568 drm_exec_until_all_locked(&exec) { 568 569 ret = xe_pf_begin(&exec, vma, IS_DGFX(vm->xe), tile->mem.vram); 569 570 drm_exec_retry_on_contention(&exec); 570 - if (ret) 571 - break; 571 + xe_validation_retry_on_oom(&ctx, &ret); 572 572 } 573 573 574 - drm_exec_fini(&exec); 574 + xe_validation_ctx_fini(&ctx); 575 575 unlock_vm: 576 576 up_read(&vm->lock); 577 577 xe_vm_put(vm);
+58 -81
drivers/gpu/drm/xe/xe_vm.c
··· 210 210 .num_fences = 1, 211 211 }; 212 212 struct drm_exec *exec = &vm_exec.exec; 213 + struct xe_validation_ctx ctx; 213 214 struct dma_fence *pfence; 214 215 int err; 215 216 bool wait; ··· 218 217 xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm)); 219 218 220 219 down_write(&vm->lock); 221 - err = drm_gpuvm_exec_lock(&vm_exec); 220 + err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val); 222 221 if (err) 223 222 goto out_up_write; 224 223 ··· 250 249 xe_svm_notifier_unlock(vm); 251 250 252 251 out_fini: 253 - drm_exec_fini(exec); 252 + xe_validation_ctx_fini(&ctx); 254 253 out_up_write: 255 254 up_write(&vm->lock); 256 255 ··· 312 311 xe_vm_unlock(vm); 313 312 314 313 /* TODO: Inform user the VM is banned */ 315 - } 316 - 317 - /** 318 - * xe_vm_validate_should_retry() - Whether to retry after a validate error. 319 - * @exec: The drm_exec object used for locking before validation. 320 - * @err: The error returned from ttm_bo_validate(). 321 - * @end: A ktime_t cookie that should be set to 0 before first use and 322 - * that should be reused on subsequent calls. 323 - * 324 - * With multiple active VMs, under memory pressure, it is possible that 325 - * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM. 326 - * Until ttm properly handles locking in such scenarios, best thing the 327 - * driver can do is retry with a timeout. Check if that is necessary, and 328 - * if so unlock the drm_exec's objects while keeping the ticket to prepare 329 - * for a rerun. 330 - * 331 - * Return: true if a retry after drm_exec_init() is recommended; 332 - * false otherwise. 333 - */ 334 - bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end) 335 - { 336 - ktime_t cur; 337 - 338 - if (err != -ENOMEM) 339 - return false; 340 - 341 - cur = ktime_get(); 342 - *end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS); 343 - if (!ktime_before(cur, *end)) 344 - return false; 345 - 346 - msleep(20); 347 - return true; 348 314 } 349 315 350 316 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec) ··· 444 476 static void preempt_rebind_work_func(struct work_struct *w) 445 477 { 446 478 struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work); 479 + struct xe_validation_ctx ctx; 447 480 struct drm_exec exec; 448 481 unsigned int fence_count = 0; 449 482 LIST_HEAD(preempt_fences); 450 - ktime_t end = 0; 451 483 int err = 0; 452 484 long wait; 453 485 int __maybe_unused tries = 0; ··· 475 507 goto out_unlock_outer; 476 508 } 477 509 478 - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0); 510 + err = xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, 511 + (struct xe_val_flags) {.interruptible = true}); 512 + if (err) 513 + goto out_unlock_outer; 479 514 480 515 drm_exec_until_all_locked(&exec) { 481 516 bool done = false; 482 517 483 518 err = xe_preempt_work_begin(&exec, vm, &done); 484 519 drm_exec_retry_on_contention(&exec); 520 + xe_validation_retry_on_oom(&ctx, &err); 485 521 if (err || done) { 486 - drm_exec_fini(&exec); 487 - if (err && xe_vm_validate_should_retry(&exec, err, &end)) 488 - err = -EAGAIN; 489 - 522 + xe_validation_ctx_fini(&ctx); 490 523 goto out_unlock_outer; 491 524 } 492 525 } ··· 535 566 xe_svm_notifier_unlock(vm); 536 567 537 568 out_unlock: 538 - drm_exec_fini(&exec); 569 + xe_validation_ctx_fini(&ctx); 539 570 out_unlock_outer: 540 571 if (err == -EAGAIN) { 541 572 trace_xe_vm_rebind_worker_retry(vm); ··· 1133 1164 1134 1165 static void xe_vma_destroy_unlocked(struct xe_vma *vma) 1135 1166 { 1167 + struct xe_device *xe = xe_vma_vm(vma)->xe; 1168 + struct xe_validation_ctx ctx; 1136 1169 struct drm_exec exec; 1137 - int err; 1170 + int err = 0; 1138 1171 1139 - drm_exec_init(&exec, 0, 0); 1140 - drm_exec_until_all_locked(&exec) { 1172 + xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) { 1141 1173 err = xe_vm_lock_vma(&exec, vma); 1142 1174 drm_exec_retry_on_contention(&exec); 1143 1175 if (XE_WARN_ON(err)) 1144 1176 break; 1177 + xe_vma_destroy(vma, NULL); 1145 1178 } 1146 - 1147 - xe_vma_destroy(vma, NULL); 1148 - 1149 - drm_exec_fini(&exec); 1179 + xe_assert(xe, !err); 1150 1180 } 1151 1181 1152 1182 struct xe_vma * ··· 2351 2383 struct xe_vma_mem_attr *attr, unsigned int flags) 2352 2384 { 2353 2385 struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL; 2386 + struct xe_validation_ctx ctx; 2354 2387 struct drm_exec exec; 2355 2388 struct xe_vma *vma; 2356 2389 int err = 0; ··· 2359 2390 lockdep_assert_held_write(&vm->lock); 2360 2391 2361 2392 if (bo) { 2362 - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0); 2363 - drm_exec_until_all_locked(&exec) { 2364 - err = 0; 2393 + err = 0; 2394 + xe_validation_guard(&ctx, &vm->xe->val, &exec, 2395 + (struct xe_val_flags) {.interruptible = true}, err) { 2365 2396 if (!bo->vm) { 2366 2397 err = drm_exec_lock_obj(&exec, xe_vm_obj(vm)); 2367 2398 drm_exec_retry_on_contention(&exec); ··· 2370 2401 err = drm_exec_lock_obj(&exec, &bo->ttm.base); 2371 2402 drm_exec_retry_on_contention(&exec); 2372 2403 } 2373 - if (err) { 2374 - drm_exec_fini(&exec); 2404 + if (err) 2375 2405 return ERR_PTR(err); 2406 + 2407 + vma = xe_vma_create(vm, bo, op->gem.offset, 2408 + op->va.addr, op->va.addr + 2409 + op->va.range - 1, attr, flags); 2410 + if (IS_ERR(vma)) 2411 + return vma; 2412 + 2413 + if (!bo->vm) { 2414 + err = add_preempt_fences(vm, bo); 2415 + if (err) { 2416 + prep_vma_destroy(vm, vma, false); 2417 + xe_vma_destroy(vma, NULL); 2418 + } 2376 2419 } 2377 2420 } 2421 + if (err) 2422 + return ERR_PTR(err); 2423 + } else { 2424 + vma = xe_vma_create(vm, NULL, op->gem.offset, 2425 + op->va.addr, op->va.addr + 2426 + op->va.range - 1, attr, flags); 2427 + if (IS_ERR(vma)) 2428 + return vma; 2429 + 2430 + if (xe_vma_is_userptr(vma)) 2431 + err = xe_vma_userptr_pin_pages(to_userptr_vma(vma)); 2378 2432 } 2379 - vma = xe_vma_create(vm, bo, op->gem.offset, 2380 - op->va.addr, op->va.addr + 2381 - op->va.range - 1, attr, flags); 2382 - if (IS_ERR(vma)) 2383 - goto err_unlock; 2384 - 2385 - if (xe_vma_is_userptr(vma)) 2386 - err = xe_vma_userptr_pin_pages(to_userptr_vma(vma)); 2387 - else if (!xe_vma_has_no_bo(vma) && !bo->vm) 2388 - err = add_preempt_fences(vm, bo); 2389 - 2390 - err_unlock: 2391 - if (bo) 2392 - drm_exec_fini(&exec); 2393 - 2394 2433 if (err) { 2395 2434 prep_vma_destroy(vm, vma, false); 2396 2435 xe_vma_destroy_unlocked(vma); ··· 3197 3220 static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm, 3198 3221 struct xe_vma_ops *vops) 3199 3222 { 3223 + struct xe_validation_ctx ctx; 3200 3224 struct drm_exec exec; 3201 3225 struct dma_fence *fence; 3202 - int err; 3226 + int err = 0; 3203 3227 3204 3228 lockdep_assert_held_write(&vm->lock); 3205 3229 3206 - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT | 3207 - DRM_EXEC_IGNORE_DUPLICATES, 0); 3208 - drm_exec_until_all_locked(&exec) { 3230 + xe_validation_guard(&ctx, &vm->xe->val, &exec, 3231 + ((struct xe_val_flags) { 3232 + .interruptible = true, 3233 + .exec_ignore_duplicates = true, 3234 + }), err) { 3209 3235 err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops); 3210 3236 drm_exec_retry_on_contention(&exec); 3211 - if (err) { 3212 - fence = ERR_PTR(err); 3213 - goto unlock; 3214 - } 3237 + xe_validation_retry_on_oom(&ctx, &err); 3238 + if (err) 3239 + return ERR_PTR(err); 3215 3240 3216 3241 xe_vm_set_validation_exec(vm, &exec); 3217 3242 fence = ops_execute(vm, vops); ··· 3221 3242 if (IS_ERR(fence)) { 3222 3243 if (PTR_ERR(fence) == -ENODATA) 3223 3244 vm_bind_ioctl_ops_fini(vm, vops, NULL); 3224 - goto unlock; 3245 + return fence; 3225 3246 } 3226 3247 3227 3248 vm_bind_ioctl_ops_fini(vm, vops, fence); 3228 3249 } 3229 3250 3230 - unlock: 3231 - drm_exec_fini(&exec); 3232 - return fence; 3251 + return err ? ERR_PTR(err) : fence; 3233 3252 } 3234 3253 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO); 3235 3254
-2
drivers/gpu/drm/xe/xe_vm.h
··· 260 260 } 261 261 } 262 262 263 - bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end); 264 - 265 263 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma); 266 264 267 265 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,