Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/ttm/radeon/nouveau: Kill the bo lock in favour of a bo device fence_lock

The bo lock was used only to protect the bo sync object members, and since it
is a per bo lock, fencing a buffer list will see a lot of locks and unlocks.
Replace it with a per-device lock that protects the sync object members on
*all* bos. Reading and setting these members will always be very quick, so
the risk of heavy lock contention is microscopic. Note that waiting for
sync objects will always take place outside of this lock.

The bo device fence lock will eventually be replaced with a seqlock /
rcu mechanism so we can determine that a bo is idle under an
rcu / read seqlock.

However this change will allow us to batch fencing and unreserving of
buffers with a minimal amount of locking.

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Jerome Glisse <j.glisse@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>

authored by

Thomas Hellstrom and committed by
Dave Airlie
702adba2 96726fe5

+53 -51
+6 -6
drivers/gpu/drm/nouveau/nouveau_gem.c
··· 234 234 if (likely(fence)) { 235 235 struct nouveau_fence *prev_fence; 236 236 237 - spin_lock(&nvbo->bo.lock); 237 + spin_lock(&nvbo->bo.bdev->fence_lock); 238 238 prev_fence = nvbo->bo.sync_obj; 239 239 nvbo->bo.sync_obj = nouveau_fence_ref(fence); 240 - spin_unlock(&nvbo->bo.lock); 240 + spin_unlock(&nvbo->bo.bdev->fence_lock); 241 241 nouveau_fence_unref((void *)&prev_fence); 242 242 } 243 243 ··· 557 557 data |= r->vor; 558 558 } 559 559 560 - spin_lock(&nvbo->bo.lock); 560 + spin_lock(&nvbo->bo.bdev->fence_lock); 561 561 ret = ttm_bo_wait(&nvbo->bo, false, false, false); 562 - spin_unlock(&nvbo->bo.lock); 562 + spin_unlock(&nvbo->bo.bdev->fence_lock); 563 563 if (ret) { 564 564 NV_ERROR(dev, "reloc wait_idle failed: %d\n", ret); 565 565 break; ··· 791 791 } 792 792 793 793 if (req->flags & NOUVEAU_GEM_CPU_PREP_NOBLOCK) { 794 - spin_lock(&nvbo->bo.lock); 794 + spin_lock(&nvbo->bo.bdev->fence_lock); 795 795 ret = ttm_bo_wait(&nvbo->bo, false, false, no_wait); 796 - spin_unlock(&nvbo->bo.lock); 796 + spin_unlock(&nvbo->bo.bdev->fence_lock); 797 797 } else { 798 798 ret = ttm_bo_synccpu_write_grab(&nvbo->bo, no_wait); 799 799 if (ret == 0)
+2 -2
drivers/gpu/drm/radeon/radeon_object.c
··· 369 369 370 370 list_for_each_entry(lobj, head, list) { 371 371 bo = lobj->bo; 372 - spin_lock(&bo->tbo.lock); 372 + spin_lock(&bo->tbo.bdev->fence_lock); 373 373 old_fence = (struct radeon_fence *)bo->tbo.sync_obj; 374 374 bo->tbo.sync_obj = radeon_fence_ref(fence); 375 375 bo->tbo.sync_obj_arg = NULL; 376 - spin_unlock(&bo->tbo.lock); 376 + spin_unlock(&bo->tbo.bdev->fence_lock); 377 377 if (old_fence) { 378 378 radeon_fence_unref(&old_fence); 379 379 }
+2 -2
drivers/gpu/drm/radeon/radeon_object.h
··· 126 126 r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0); 127 127 if (unlikely(r != 0)) 128 128 return r; 129 - spin_lock(&bo->tbo.lock); 129 + spin_lock(&bo->tbo.bdev->fence_lock); 130 130 if (mem_type) 131 131 *mem_type = bo->tbo.mem.mem_type; 132 132 if (bo->tbo.sync_obj) 133 133 r = ttm_bo_wait(&bo->tbo, true, true, no_wait); 134 - spin_unlock(&bo->tbo.lock); 134 + spin_unlock(&bo->tbo.bdev->fence_lock); 135 135 ttm_bo_unreserve(&bo->tbo); 136 136 return r; 137 137 }
+28 -27
drivers/gpu/drm/ttm/ttm_bo.c
··· 427 427 } 428 428 429 429 if (bo->mem.mm_node) { 430 - spin_lock(&bo->lock); 431 430 bo->offset = (bo->mem.start << PAGE_SHIFT) + 432 431 bdev->man[bo->mem.mem_type].gpu_offset; 433 432 bo->cur_placement = bo->mem.placement; 434 - spin_unlock(&bo->lock); 435 433 } else 436 434 bo->offset = 0; 437 435 ··· 483 485 int put_count; 484 486 int ret; 485 487 486 - spin_lock(&bo->lock); 488 + spin_lock(&bdev->fence_lock); 487 489 (void) ttm_bo_wait(bo, false, false, true); 488 490 if (!bo->sync_obj) { 489 491 490 492 spin_lock(&glob->lru_lock); 491 493 492 494 /** 493 - * Lock inversion between bo::reserve and bo::lock here, 495 + * Lock inversion between bo:reserve and bdev::fence_lock here, 494 496 * but that's OK, since we're only trylocking. 495 497 */ 496 498 ··· 499 501 if (unlikely(ret == -EBUSY)) 500 502 goto queue; 501 503 502 - spin_unlock(&bo->lock); 504 + spin_unlock(&bdev->fence_lock); 503 505 put_count = ttm_bo_del_from_lru(bo); 504 506 505 507 spin_unlock(&glob->lru_lock); ··· 520 522 kref_get(&bo->list_kref); 521 523 list_add_tail(&bo->ddestroy, &bdev->ddestroy); 522 524 spin_unlock(&glob->lru_lock); 523 - spin_unlock(&bo->lock); 525 + spin_unlock(&bdev->fence_lock); 524 526 525 527 if (sync_obj) { 526 528 driver->sync_obj_flush(sync_obj, sync_obj_arg); ··· 545 547 bool no_wait_reserve, 546 548 bool no_wait_gpu) 547 549 { 550 + struct ttm_bo_device *bdev = bo->bdev; 548 551 struct ttm_bo_global *glob = bo->glob; 549 552 int put_count; 550 553 int ret = 0; 551 554 552 555 retry: 553 - spin_lock(&bo->lock); 556 + spin_lock(&bdev->fence_lock); 554 557 ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu); 555 - spin_unlock(&bo->lock); 558 + spin_unlock(&bdev->fence_lock); 556 559 557 560 if (unlikely(ret != 0)) 558 561 return ret; ··· 706 707 struct ttm_placement placement; 707 708 int ret = 0; 708 709 709 - spin_lock(&bo->lock); 710 + spin_lock(&bdev->fence_lock); 710 711 ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu); 711 - 
spin_unlock(&bo->lock); 712 + spin_unlock(&bdev->fence_lock); 712 713 713 714 if (unlikely(ret != 0)) { 714 715 if (ret != -ERESTARTSYS) { ··· 1043 1044 { 1044 1045 int ret = 0; 1045 1046 struct ttm_mem_reg mem; 1047 + struct ttm_bo_device *bdev = bo->bdev; 1046 1048 1047 1049 BUG_ON(!atomic_read(&bo->reserved)); 1048 1050 ··· 1052 1052 * Have the driver move function wait for idle when necessary, 1053 1053 * instead of doing it here. 1054 1054 */ 1055 - spin_lock(&bo->lock); 1055 + spin_lock(&bdev->fence_lock); 1056 1056 ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu); 1057 - spin_unlock(&bo->lock); 1057 + spin_unlock(&bdev->fence_lock); 1058 1058 if (ret) 1059 1059 return ret; 1060 1060 mem.num_pages = bo->num_pages; ··· 1171 1171 } 1172 1172 bo->destroy = destroy; 1173 1173 1174 - spin_lock_init(&bo->lock); 1175 1174 kref_init(&bo->kref); 1176 1175 kref_init(&bo->list_kref); 1177 1176 atomic_set(&bo->cpu_writers, 0); ··· 1534 1535 bdev->dev_mapping = NULL; 1535 1536 bdev->glob = glob; 1536 1537 bdev->need_dma32 = need_dma32; 1537 - 1538 + spin_lock_init(&bdev->fence_lock); 1538 1539 mutex_lock(&glob->device_list_mutex); 1539 1540 list_add_tail(&bdev->device_list, &glob->device_list); 1540 1541 mutex_unlock(&glob->device_list_mutex); ··· 1658 1659 bool lazy, bool interruptible, bool no_wait) 1659 1660 { 1660 1661 struct ttm_bo_driver *driver = bo->bdev->driver; 1662 + struct ttm_bo_device *bdev = bo->bdev; 1661 1663 void *sync_obj; 1662 1664 void *sync_obj_arg; 1663 1665 int ret = 0; ··· 1672 1672 void *tmp_obj = bo->sync_obj; 1673 1673 bo->sync_obj = NULL; 1674 1674 clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); 1675 - spin_unlock(&bo->lock); 1675 + spin_unlock(&bdev->fence_lock); 1676 1676 driver->sync_obj_unref(&tmp_obj); 1677 - spin_lock(&bo->lock); 1677 + spin_lock(&bdev->fence_lock); 1678 1678 continue; 1679 1679 } 1680 1680 ··· 1683 1683 1684 1684 sync_obj = driver->sync_obj_ref(bo->sync_obj); 1685 1685 sync_obj_arg = bo->sync_obj_arg; 1686 
- spin_unlock(&bo->lock); 1686 + spin_unlock(&bdev->fence_lock); 1687 1687 ret = driver->sync_obj_wait(sync_obj, sync_obj_arg, 1688 1688 lazy, interruptible); 1689 1689 if (unlikely(ret != 0)) { 1690 1690 driver->sync_obj_unref(&sync_obj); 1691 - spin_lock(&bo->lock); 1691 + spin_lock(&bdev->fence_lock); 1692 1692 return ret; 1693 1693 } 1694 - spin_lock(&bo->lock); 1694 + spin_lock(&bdev->fence_lock); 1695 1695 if (likely(bo->sync_obj == sync_obj && 1696 1696 bo->sync_obj_arg == sync_obj_arg)) { 1697 1697 void *tmp_obj = bo->sync_obj; 1698 1698 bo->sync_obj = NULL; 1699 1699 clear_bit(TTM_BO_PRIV_FLAG_MOVING, 1700 1700 &bo->priv_flags); 1701 - spin_unlock(&bo->lock); 1701 + spin_unlock(&bdev->fence_lock); 1702 1702 driver->sync_obj_unref(&sync_obj); 1703 1703 driver->sync_obj_unref(&tmp_obj); 1704 - spin_lock(&bo->lock); 1704 + spin_lock(&bdev->fence_lock); 1705 1705 } else { 1706 - spin_unlock(&bo->lock); 1706 + spin_unlock(&bdev->fence_lock); 1707 1707 driver->sync_obj_unref(&sync_obj); 1708 - spin_lock(&bo->lock); 1708 + spin_lock(&bdev->fence_lock); 1709 1709 } 1710 1710 } 1711 1711 return 0; ··· 1714 1714 1715 1715 int ttm_bo_synccpu_write_grab(struct ttm_buffer_object *bo, bool no_wait) 1716 1716 { 1717 + struct ttm_bo_device *bdev = bo->bdev; 1717 1718 int ret = 0; 1718 1719 1719 1720 /* ··· 1724 1723 ret = ttm_bo_reserve(bo, true, no_wait, false, 0); 1725 1724 if (unlikely(ret != 0)) 1726 1725 return ret; 1727 - spin_lock(&bo->lock); 1726 + spin_lock(&bdev->fence_lock); 1728 1727 ret = ttm_bo_wait(bo, false, true, no_wait); 1729 - spin_unlock(&bo->lock); 1728 + spin_unlock(&bdev->fence_lock); 1730 1729 if (likely(ret == 0)) 1731 1730 atomic_inc(&bo->cpu_writers); 1732 1731 ttm_bo_unreserve(bo); ··· 1798 1797 * Wait for GPU, then move to system cached. 
1799 1798 */ 1800 1799 1801 - spin_lock(&bo->lock); 1800 + spin_lock(&bo->bdev->fence_lock); 1802 1801 ret = ttm_bo_wait(bo, false, false, false); 1803 - spin_unlock(&bo->lock); 1802 + spin_unlock(&bo->bdev->fence_lock); 1804 1803 1805 1804 if (unlikely(ret != 0)) 1806 1805 goto out;
+3 -4
drivers/gpu/drm/ttm/ttm_bo_util.c
··· 337 337 * TODO: Explicit member copy would probably be better here. 338 338 */ 339 339 340 - spin_lock_init(&fbo->lock); 341 340 init_waitqueue_head(&fbo->event_queue); 342 341 INIT_LIST_HEAD(&fbo->ddestroy); 343 342 INIT_LIST_HEAD(&fbo->lru); ··· 519 520 struct ttm_buffer_object *ghost_obj; 520 521 void *tmp_obj = NULL; 521 522 522 - spin_lock(&bo->lock); 523 + spin_lock(&bdev->fence_lock); 523 524 if (bo->sync_obj) { 524 525 tmp_obj = bo->sync_obj; 525 526 bo->sync_obj = NULL; ··· 528 529 bo->sync_obj_arg = sync_obj_arg; 529 530 if (evict) { 530 531 ret = ttm_bo_wait(bo, false, false, false); 531 - spin_unlock(&bo->lock); 532 + spin_unlock(&bdev->fence_lock); 532 533 if (tmp_obj) 533 534 driver->sync_obj_unref(&tmp_obj); 534 535 if (ret) ··· 551 552 */ 552 553 553 554 set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); 554 - spin_unlock(&bo->lock); 555 + spin_unlock(&bdev->fence_lock); 555 556 if (tmp_obj) 556 557 driver->sync_obj_unref(&tmp_obj); 557 558
+3 -3
drivers/gpu/drm/ttm/ttm_bo_vm.c
··· 118 118 * move. 119 119 */ 120 120 121 - spin_lock(&bo->lock); 121 + spin_lock(&bdev->fence_lock); 122 122 if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) { 123 123 ret = ttm_bo_wait(bo, false, true, false); 124 - spin_unlock(&bo->lock); 124 + spin_unlock(&bdev->fence_lock); 125 125 if (unlikely(ret != 0)) { 126 126 retval = (ret != -ERESTARTSYS) ? 127 127 VM_FAULT_SIGBUS : VM_FAULT_NOPAGE; 128 128 goto out_unlock; 129 129 } 130 130 } else 131 - spin_unlock(&bo->lock); 131 + spin_unlock(&bdev->fence_lock); 132 132 133 133 134 134 ret = ttm_mem_io_reserve(bdev, &bo->mem);
+4 -3
drivers/gpu/drm/ttm/ttm_execbuf_util.c
··· 203 203 204 204 list_for_each_entry(entry, list, head) { 205 205 struct ttm_buffer_object *bo = entry->bo; 206 - struct ttm_bo_driver *driver = bo->bdev->driver; 206 + struct ttm_bo_device *bdev = bo->bdev; 207 + struct ttm_bo_driver *driver = bdev->driver; 207 208 void *old_sync_obj; 208 209 209 - spin_lock(&bo->lock); 210 + spin_lock(&bdev->fence_lock); 210 211 old_sync_obj = bo->sync_obj; 211 212 bo->sync_obj = driver->sync_obj_ref(sync_obj); 212 213 bo->sync_obj_arg = entry->new_sync_obj_arg; 213 - spin_unlock(&bo->lock); 214 + spin_unlock(&bdev->fence_lock); 214 215 ttm_bo_unreserve(bo); 215 216 entry->reserved = false; 216 217 if (old_sync_obj)
+2 -4
include/drm/ttm/ttm_bo_api.h
··· 154 154 * keeps one refcount. When this refcount reaches zero, 155 155 * the object is destroyed. 156 156 * @event_queue: Queue for processes waiting on buffer object status change. 157 - * @lock: spinlock protecting mostly synchronization members. 158 157 * @mem: structure describing current placement. 159 158 * @persistant_swap_storage: Usually the swap storage is deleted for buffers 160 159 * pinned in physical memory. If this behaviour is not desired, this member ··· 212 213 struct kref kref; 213 214 struct kref list_kref; 214 215 wait_queue_head_t event_queue; 215 - spinlock_t lock; 216 216 217 217 /** 218 218 * Members protected by the bo::reserved lock. ··· 246 248 atomic_t reserved; 247 249 248 250 /** 249 - * Members protected by the bo::lock 251 + * Members protected by struct buffer_object_device::fence_lock 250 252 * In addition, setting sync_obj to anything else 251 253 * than NULL requires bo::reserved to be held. This allows for 252 - * checking NULL while reserved but not holding bo::lock. 254 + * checking NULL while reserved but not holding the mentioned lock. 253 255 */ 254 256 255 257 void *sync_obj_arg;
+3
include/drm/ttm/ttm_bo_driver.h
··· 510 510 * 511 511 * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver. 512 512 * @man: An array of mem_type_managers. 513 + * @fence_lock: Protects the synchronizing members on *all* bos belonging 514 + * to this device. 513 515 * @addr_space_mm: Range manager for the device address space. 514 516 * lru_lock: Spinlock that protects the buffer+device lru lists and 515 517 * ddestroy lists. ··· 533 531 struct ttm_bo_driver *driver; 534 532 rwlock_t vm_lock; 535 533 struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES]; 534 + spinlock_t fence_lock; 536 535 /* 537 536 * Protected by the vm lock. 538 537 */