Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: partially revert "revert to old status lock handling v3"

The CI systems are pointing out list corruptions, so we still need to
fix something here.

Keep the asserts, but revert the lock changes for now.

Fixes: 59e4405e9ee2 ("drm/amdgpu: revert to old status lock handling v3")
Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Christian König and committed by Alex Deucher
a107aeb6 ddbfac15

+105 -68
+4 -4
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
··· 726 726 struct amdgpu_bo *bo; 727 727 int ret; 728 728 729 - spin_lock(&vm->invalidated_lock); 729 + spin_lock(&vm->status_lock); 730 730 while (!list_empty(&vm->invalidated)) { 731 731 bo_va = list_first_entry(&vm->invalidated, 732 732 struct amdgpu_bo_va, 733 733 base.vm_status); 734 - spin_unlock(&vm->invalidated_lock); 734 + spin_unlock(&vm->status_lock); 735 735 736 736 bo = bo_va->base.bo; 737 737 ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2); ··· 748 748 if (ret) 749 749 return ret; 750 750 751 - spin_lock(&vm->invalidated_lock); 751 + spin_lock(&vm->status_lock); 752 752 } 753 - spin_unlock(&vm->invalidated_lock); 753 + spin_unlock(&vm->status_lock); 754 754 755 755 return 0; 756 756 }
+91 -55
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
··· 153 153 154 154 vm_bo->moved = true; 155 155 amdgpu_vm_assert_locked(vm); 156 + spin_lock(&vm_bo->vm->status_lock); 156 157 if (bo->tbo.type == ttm_bo_type_kernel) 157 158 list_move(&vm_bo->vm_status, &vm->evicted); 158 159 else 159 160 list_move_tail(&vm_bo->vm_status, &vm->evicted); 161 + spin_unlock(&vm_bo->vm->status_lock); 160 162 } 161 163 /** 162 164 * amdgpu_vm_bo_moved - vm_bo is moved ··· 171 169 static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo) 172 170 { 173 171 amdgpu_vm_assert_locked(vm_bo->vm); 172 + spin_lock(&vm_bo->vm->status_lock); 174 173 list_move(&vm_bo->vm_status, &vm_bo->vm->moved); 174 + spin_unlock(&vm_bo->vm->status_lock); 175 175 } 176 176 177 177 /** ··· 187 183 static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo) 188 184 { 189 185 amdgpu_vm_assert_locked(vm_bo->vm); 186 + spin_lock(&vm_bo->vm->status_lock); 190 187 list_move(&vm_bo->vm_status, &vm_bo->vm->idle); 188 + spin_unlock(&vm_bo->vm->status_lock); 191 189 vm_bo->moved = false; 192 190 } 193 191 ··· 203 197 */ 204 198 static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo) 205 199 { 206 - spin_lock(&vm_bo->vm->invalidated_lock); 200 + spin_lock(&vm_bo->vm->status_lock); 207 201 list_move(&vm_bo->vm_status, &vm_bo->vm->invalidated); 208 - spin_unlock(&vm_bo->vm->invalidated_lock); 202 + spin_unlock(&vm_bo->vm->status_lock); 209 203 } 210 204 211 205 /** ··· 218 212 */ 219 213 static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo) 220 214 { 221 - amdgpu_vm_assert_locked(vm_bo->vm); 222 215 vm_bo->moved = true; 216 + spin_lock(&vm_bo->vm->status_lock); 223 217 list_move(&vm_bo->vm_status, &vm_bo->vm->evicted_user); 218 + spin_unlock(&vm_bo->vm->status_lock); 224 219 } 225 220 226 221 /** ··· 235 228 static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo) 236 229 { 237 230 amdgpu_vm_assert_locked(vm_bo->vm); 238 - if (vm_bo->bo->parent) 231 + if (vm_bo->bo->parent) { 232 + spin_lock(&vm_bo->vm->status_lock); 239 
233 list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); 240 - else 234 + spin_unlock(&vm_bo->vm->status_lock); 235 + } else { 241 236 amdgpu_vm_bo_idle(vm_bo); 237 + } 242 238 } 243 239 244 240 /** ··· 255 245 static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo) 256 246 { 257 247 amdgpu_vm_assert_locked(vm_bo->vm); 248 + spin_lock(&vm_bo->vm->status_lock); 258 249 list_move(&vm_bo->vm_status, &vm_bo->vm->done); 250 + spin_unlock(&vm_bo->vm->status_lock); 259 251 } 260 252 261 253 /** ··· 271 259 { 272 260 struct amdgpu_vm_bo_base *vm_bo, *tmp; 273 261 274 - spin_lock(&vm->invalidated_lock); 262 + amdgpu_vm_assert_locked(vm); 263 + 264 + spin_lock(&vm->status_lock); 275 265 list_splice_init(&vm->done, &vm->invalidated); 276 266 list_for_each_entry(vm_bo, &vm->invalidated, vm_status) 277 267 vm_bo->moved = true; 278 - spin_unlock(&vm->invalidated_lock); 279 268 280 - amdgpu_vm_assert_locked(vm); 281 269 list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) { 282 270 struct amdgpu_bo *bo = vm_bo->bo; 283 271 ··· 287 275 else if (bo->parent) 288 276 list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); 289 277 } 278 + spin_unlock(&vm->status_lock); 290 279 } 291 280 292 281 /** 293 282 * amdgpu_vm_update_shared - helper to update shared memory stat 294 283 * @base: base structure for tracking BO usage in a VM 295 284 * 296 - * Takes the vm stats_lock and updates the shared memory stat. If the basic 285 + * Takes the vm status_lock and updates the shared memory stat. If the basic 297 286 * stat changed (e.g. buffer was moved) amdgpu_vm_update_stats need to be called 298 287 * as well. 
299 288 */ ··· 307 294 bool shared; 308 295 309 296 dma_resv_assert_held(bo->tbo.base.resv); 310 - spin_lock(&vm->stats_lock); 297 + spin_lock(&vm->status_lock); 311 298 shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base); 312 299 if (base->shared != shared) { 313 300 base->shared = shared; ··· 319 306 vm->stats[bo_memtype].drm.private += size; 320 307 } 321 308 } 322 - spin_unlock(&vm->stats_lock); 309 + spin_unlock(&vm->status_lock); 323 310 } 324 311 325 312 /** ··· 344 331 * be bo->tbo.resource 345 332 * @sign: if we should add (+1) or subtract (-1) from the stat 346 333 * 347 - * Caller need to have the vm stats_lock held. Useful for when multiple update 334 + * Caller need to have the vm status_lock held. Useful for when multiple update 348 335 * need to happen at the same time. 349 336 */ 350 337 static void amdgpu_vm_update_stats_locked(struct amdgpu_vm_bo_base *base, 351 - struct ttm_resource *res, int sign) 338 + struct ttm_resource *res, int sign) 352 339 { 353 340 struct amdgpu_vm *vm = base->vm; 354 341 struct amdgpu_bo *bo = base->bo; ··· 372 359 */ 373 360 if (bo->flags & AMDGPU_GEM_CREATE_DISCARDABLE) 374 361 vm->stats[res_memtype].drm.purgeable += size; 375 - if (!(bo->preferred_domains & 376 - amdgpu_mem_type_to_domain(res_memtype))) 362 + if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(res_memtype))) 377 363 vm->stats[bo_memtype].evicted += size; 378 364 } 379 365 } ··· 391 379 { 392 380 struct amdgpu_vm *vm = base->vm; 393 381 394 - spin_lock(&vm->stats_lock); 382 + spin_lock(&vm->status_lock); 395 383 amdgpu_vm_update_stats_locked(base, res, sign); 396 - spin_unlock(&vm->stats_lock); 384 + spin_unlock(&vm->status_lock); 397 385 } 398 386 399 387 /** ··· 419 407 base->next = bo->vm_bo; 420 408 bo->vm_bo = base; 421 409 422 - spin_lock(&vm->stats_lock); 410 + spin_lock(&vm->status_lock); 423 411 base->shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base); 424 412 amdgpu_vm_update_stats_locked(base, 
bo->tbo.resource, +1); 425 - spin_unlock(&vm->stats_lock); 413 + spin_unlock(&vm->status_lock); 426 414 427 415 if (!amdgpu_vm_is_bo_always_valid(vm, bo)) 428 416 return; ··· 481 469 int ret; 482 470 483 471 /* We can only trust prev->next while holding the lock */ 484 - spin_lock(&vm->invalidated_lock); 472 + spin_lock(&vm->status_lock); 485 473 while (!list_is_head(prev->next, &vm->done)) { 486 474 bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status); 487 - spin_unlock(&vm->invalidated_lock); 475 + spin_unlock(&vm->status_lock); 488 476 489 477 bo = bo_va->base.bo; 490 478 if (bo) { ··· 492 480 if (unlikely(ret)) 493 481 return ret; 494 482 } 495 - spin_lock(&vm->invalidated_lock); 483 + spin_lock(&vm->status_lock); 496 484 prev = prev->next; 497 485 } 498 - spin_unlock(&vm->invalidated_lock); 486 + spin_unlock(&vm->status_lock); 499 487 500 488 return 0; 501 489 } ··· 591 579 void *param) 592 580 { 593 581 uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm); 594 - struct amdgpu_vm_bo_base *bo_base, *tmp; 582 + struct amdgpu_vm_bo_base *bo_base; 595 583 struct amdgpu_bo *bo; 596 584 int r; 597 585 ··· 604 592 return r; 605 593 } 606 594 607 - list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) { 595 + spin_lock(&vm->status_lock); 596 + while (!list_empty(&vm->evicted)) { 597 + bo_base = list_first_entry(&vm->evicted, 598 + struct amdgpu_vm_bo_base, 599 + vm_status); 600 + spin_unlock(&vm->status_lock); 601 + 608 602 bo = bo_base->bo; 609 603 610 604 r = validate(param, bo); ··· 623 605 vm->update_funcs->map_table(to_amdgpu_bo_vm(bo)); 624 606 amdgpu_vm_bo_relocated(bo_base); 625 607 } 608 + spin_lock(&vm->status_lock); 626 609 } 610 + while (ticket && !list_empty(&vm->evicted_user)) { 611 + bo_base = list_first_entry(&vm->evicted_user, 612 + struct amdgpu_vm_bo_base, 613 + vm_status); 614 + spin_unlock(&vm->status_lock); 627 615 628 - if (ticket) { 629 - list_for_each_entry_safe(bo_base, tmp, &vm->evicted_user, 630 - vm_status) { 631 
- bo = bo_base->bo; 632 - dma_resv_assert_held(bo->tbo.base.resv); 616 + bo = bo_base->bo; 617 + dma_resv_assert_held(bo->tbo.base.resv); 633 618 634 - r = validate(param, bo); 635 - if (r) 636 - return r; 619 + r = validate(param, bo); 620 + if (r) 621 + return r; 637 622 638 - amdgpu_vm_bo_invalidated(bo_base); 639 - } 623 + amdgpu_vm_bo_invalidated(bo_base); 624 + 625 + spin_lock(&vm->status_lock); 640 626 } 627 + spin_unlock(&vm->status_lock); 641 628 642 629 amdgpu_vm_eviction_lock(vm); 643 630 vm->evicting = false; ··· 671 648 ret = !vm->evicting; 672 649 amdgpu_vm_eviction_unlock(vm); 673 650 651 + spin_lock(&vm->status_lock); 674 652 ret &= list_empty(&vm->evicted); 653 + spin_unlock(&vm->status_lock); 675 654 676 655 spin_lock(&vm->immediate.lock); 677 656 ret &= !vm->immediate.stopped; ··· 964 939 struct amdgpu_vm *vm, bool immediate) 965 940 { 966 941 struct amdgpu_vm_update_params params; 967 - struct amdgpu_vm_bo_base *entry, *tmp; 942 + struct amdgpu_vm_bo_base *entry; 968 943 bool flush_tlb_needed = false; 944 + LIST_HEAD(relocated); 969 945 int r, idx; 970 946 971 947 amdgpu_vm_assert_locked(vm); 972 948 973 - if (list_empty(&vm->relocated)) 949 + spin_lock(&vm->status_lock); 950 + list_splice_init(&vm->relocated, &relocated); 951 + spin_unlock(&vm->status_lock); 952 + 953 + if (list_empty(&relocated)) 974 954 return 0; 975 955 976 956 if (!drm_dev_enter(adev_to_drm(adev), &idx)) ··· 991 961 if (r) 992 962 goto error; 993 963 994 - list_for_each_entry(entry, &vm->relocated, vm_status) { 964 + list_for_each_entry(entry, &relocated, vm_status) { 995 965 /* vm_flush_needed after updating moved PDEs */ 996 966 flush_tlb_needed |= entry->moved; 997 967 ··· 1007 977 if (flush_tlb_needed) 1008 978 atomic64_inc(&vm->tlb_seq); 1009 979 1010 - list_for_each_entry_safe(entry, tmp, &vm->relocated, vm_status) { 980 + while (!list_empty(&relocated)) { 981 + entry = list_first_entry(&relocated, struct amdgpu_vm_bo_base, 982 + vm_status); 1011 983 
amdgpu_vm_bo_idle(entry); 1012 984 } 1013 985 ··· 1236 1204 void amdgpu_vm_get_memory(struct amdgpu_vm *vm, 1237 1205 struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM]) 1238 1206 { 1239 - spin_lock(&vm->stats_lock); 1207 + spin_lock(&vm->status_lock); 1240 1208 memcpy(stats, vm->stats, sizeof(*stats) * __AMDGPU_PL_NUM); 1241 - spin_unlock(&vm->stats_lock); 1209 + spin_unlock(&vm->status_lock); 1242 1210 } 1243 1211 1244 1212 /** ··· 1605 1573 struct amdgpu_vm *vm, 1606 1574 struct ww_acquire_ctx *ticket) 1607 1575 { 1608 - struct amdgpu_bo_va *bo_va, *tmp; 1576 + struct amdgpu_bo_va *bo_va; 1609 1577 struct dma_resv *resv; 1610 1578 bool clear, unlock; 1611 1579 int r; 1612 1580 1613 - list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status) { 1581 + spin_lock(&vm->status_lock); 1582 + while (!list_empty(&vm->moved)) { 1583 + bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va, 1584 + base.vm_status); 1585 + spin_unlock(&vm->status_lock); 1586 + 1614 1587 /* Per VM BOs never need to bo cleared in the page tables */ 1615 1588 r = amdgpu_vm_bo_update(adev, bo_va, false); 1616 1589 if (r) 1617 1590 return r; 1591 + spin_lock(&vm->status_lock); 1618 1592 } 1619 1593 1620 - spin_lock(&vm->invalidated_lock); 1621 1594 while (!list_empty(&vm->invalidated)) { 1622 1595 bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, 1623 1596 base.vm_status); 1624 1597 resv = bo_va->base.bo->tbo.base.resv; 1625 - spin_unlock(&vm->invalidated_lock); 1598 + spin_unlock(&vm->status_lock); 1626 1599 1627 1600 /* Try to reserve the BO to avoid clearing its ptes */ 1628 1601 if (!adev->debug_vm && dma_resv_trylock(resv)) { ··· 1659 1622 bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM)) 1660 1623 amdgpu_vm_bo_evicted_user(&bo_va->base); 1661 1624 1662 - spin_lock(&vm->invalidated_lock); 1625 + spin_lock(&vm->status_lock); 1663 1626 } 1664 - spin_unlock(&vm->invalidated_lock); 1627 + spin_unlock(&vm->status_lock); 1665 1628 1666 1629 return 0; 1667 1630 } ··· 2190 
2153 } 2191 2154 } 2192 2155 2193 - spin_lock(&vm->invalidated_lock); 2156 + spin_lock(&vm->status_lock); 2194 2157 list_del(&bo_va->base.vm_status); 2195 - spin_unlock(&vm->invalidated_lock); 2158 + spin_unlock(&vm->status_lock); 2196 2159 2197 2160 list_for_each_entry_safe(mapping, next, &bo_va->valids, list) { 2198 2161 list_del(&mapping->list); ··· 2300 2263 for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { 2301 2264 struct amdgpu_vm *vm = bo_base->vm; 2302 2265 2303 - spin_lock(&vm->stats_lock); 2266 + spin_lock(&vm->status_lock); 2304 2267 amdgpu_vm_update_stats_locked(bo_base, bo->tbo.resource, -1); 2305 2268 amdgpu_vm_update_stats_locked(bo_base, new_mem, +1); 2306 - spin_unlock(&vm->stats_lock); 2269 + spin_unlock(&vm->status_lock); 2307 2270 } 2308 2271 2309 2272 amdgpu_vm_bo_invalidate(bo, evicted); ··· 2571 2534 INIT_LIST_HEAD(&vm->relocated); 2572 2535 INIT_LIST_HEAD(&vm->moved); 2573 2536 INIT_LIST_HEAD(&vm->idle); 2574 - spin_lock_init(&vm->invalidated_lock); 2575 2537 INIT_LIST_HEAD(&vm->invalidated); 2538 + spin_lock_init(&vm->status_lock); 2576 2539 INIT_LIST_HEAD(&vm->freed); 2577 2540 INIT_LIST_HEAD(&vm->done); 2578 2541 INIT_KFIFO(vm->faults); 2579 - spin_lock_init(&vm->stats_lock); 2580 2542 2581 2543 r = amdgpu_vm_init_entities(adev, vm); 2582 2544 if (r) ··· 3051 3015 3052 3016 amdgpu_vm_assert_locked(vm); 3053 3017 3018 + spin_lock(&vm->status_lock); 3054 3019 seq_puts(m, "\tIdle BOs:\n"); 3055 3020 list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) { 3056 3021 if (!bo_va->base.bo) ··· 3089 3052 id = 0; 3090 3053 3091 3054 seq_puts(m, "\tInvalidated BOs:\n"); 3092 - spin_lock(&vm->invalidated_lock); 3093 3055 list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) { 3094 3056 if (!bo_va->base.bo) 3095 3057 continue; 3096 3058 total_invalidated += amdgpu_bo_print_info(id++, bo_va->base.bo, m); 3097 3059 } 3098 - spin_unlock(&vm->invalidated_lock); 3099 3060 total_invalidated_objs = id; 3100 3061 id = 
0; 3101 3062 ··· 3103 3068 continue; 3104 3069 total_done += amdgpu_bo_print_info(id++, bo_va->base.bo, m); 3105 3070 } 3071 + spin_unlock(&vm->status_lock); 3106 3072 total_done_objs = id; 3107 3073 3108 3074 seq_printf(m, "\tTotal idle size: %12lld\tobjs:\t%d\n", total_idle,
+6 -9
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
··· 203 203 /* protected by bo being reserved */ 204 204 struct amdgpu_vm_bo_base *next; 205 205 206 - /* protected by vm reservation and invalidated_lock */ 206 + /* protected by vm status_lock */ 207 207 struct list_head vm_status; 208 208 209 209 /* if the bo is counted as shared in mem stats 210 - * protected by vm BO being reserved */ 210 + * protected by vm status_lock */ 211 211 bool shared; 212 212 213 213 /* protected by the BO being reserved */ ··· 343 343 bool evicting; 344 344 unsigned int saved_flags; 345 345 346 - /* Memory statistics for this vm, protected by stats_lock */ 347 - spinlock_t stats_lock; 346 + /* Lock to protect vm_bo add/del/move on all lists of vm */ 347 + spinlock_t status_lock; 348 + 349 + /* Memory statistics for this vm, protected by status_lock */ 348 350 struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM]; 349 351 350 352 /* ··· 354 352 * PDs, PTs or per VM BOs. The state transits are: 355 353 * 356 354 * evicted -> relocated (PDs, PTs) or moved (per VM BOs) -> idle 357 - * 358 - * Lists are protected by the root PD dma_resv lock. 359 355 */ 360 356 361 357 /* Per-VM and PT BOs who needs a validation */ ··· 374 374 * state transits are: 375 375 * 376 376 * evicted_user or invalidated -> done 377 - * 378 - * Lists are protected by the invalidated_lock. 379 377 */ 380 - spinlock_t invalidated_lock; 381 378 382 379 /* BOs for user mode queues that need a validation */ 383 380 struct list_head evicted_user;
+4
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
··· 543 543 entry->bo->vm_bo = NULL; 544 544 ttm_bo_set_bulk_move(&entry->bo->tbo, NULL); 545 545 546 + spin_lock(&entry->vm->status_lock); 546 547 list_del(&entry->vm_status); 548 + spin_unlock(&entry->vm->status_lock); 547 549 amdgpu_bo_unref(&entry->bo); 548 550 } 549 551 ··· 589 587 struct amdgpu_vm_pt_cursor seek; 590 588 struct amdgpu_vm_bo_base *entry; 591 589 590 + spin_lock(&params->vm->status_lock); 592 591 for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm, cursor, seek, entry) { 593 592 if (entry && entry->bo) 594 593 list_move(&entry->vm_status, &params->tlb_flush_waitlist); ··· 597 594 598 595 /* enter start node now */ 599 596 list_move(&cursor->entry->vm_status, &params->tlb_flush_waitlist); 597 + spin_unlock(&params->vm->status_lock); 600 598 } 601 599 602 600 /**