Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

iommu/tegra: gart: Optimize mapping / unmapping performance

Currently the GART writes one page entry at a time. It is more optimal to
aggregate the writes and flush the bus buffer at the end: this gives a
10-40% map/unmap performance boost (depending on the size of the mapping)
compared with flushing after each page entry update.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Acked-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>

Authored by Dmitry Osipenko; committed by Joerg Roedel.
2fc0ac18 1d7ae53b

+10 -2
+10 -2
drivers/iommu/tegra-gart.c
···
 290  290 			}
 291  291 		}
 292  292 		gart_set_pte(gart, iova, GART_PTE(pfn));
 293    - 	FLUSH_GART_REGS(gart);
 294  293 	spin_unlock_irqrestore(&gart->pte_lock, flags);
 295  294 	return 0;
 296  295 }
···
 306  307
 307  308 	spin_lock_irqsave(&gart->pte_lock, flags);
 308  309 	gart_set_pte(gart, iova, 0);
 309    - 	FLUSH_GART_REGS(gart);
 310  310 	spin_unlock_irqrestore(&gart->pte_lock, flags);
 311  311 	return bytes;
 312  312 }
···
 371  373 	return 0;
 372  374 }
 373  375
      376 + static void gart_iommu_sync(struct iommu_domain *domain)
      377 + {
      378 + 	struct gart_domain *gart_domain = to_gart_domain(domain);
      379 + 	struct gart_device *gart = gart_domain->gart;
      380 +
      381 + 	FLUSH_GART_REGS(gart);
      382 + }
      383 +
 374  384 static const struct iommu_ops gart_iommu_ops = {
 375  385 	.capable = gart_iommu_capable,
 376  386 	.domain_alloc = gart_iommu_domain_alloc,
···
 393  387 	.iova_to_phys = gart_iommu_iova_to_phys,
 394  388 	.pgsize_bitmap = GART_IOMMU_PGSIZES,
 395  389 	.of_xlate = gart_iommu_of_xlate,
      390 + 	.iotlb_sync_map = gart_iommu_sync,
      391 + 	.iotlb_sync = gart_iommu_sync,
 396  392 };
 397  393
 398  394 static int tegra_gart_suspend(struct device *dev)