Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Set MTYPE in PTE based on BO flags

The same BO may need different MTYPEs and SNOOP flags in PTEs depending
on its current location relative to the mapping GPU. Setting MTYPEs from
clients ahead of time is not practical for coherent memory sharing.
Instead determine the correct MTYPE for the desired coherence model and
current BO location when updating the page tables.

To maintain backwards compatibility with MTYPE-selection in
AMDGPU_VA_OP_MAP, the coherence-model-based MTYPE selection is only
applied if it chooses an MTYPE other than MTYPE_NC (the default).

Add two AMDGPU_GEM_CREATE_... flags to indicate the coherence model. The
default if no flag is specified is non-coherent (i.e. coarse-grained
coherence, where coherence is only guaranteed at dispatch boundaries).

Update amdgpu_amdkfd_gpuvm.c to use this new method to choose the
correct MTYPE depending on the current memory location.

v2:
* check that bo is not NULL (e.g. PRT mappings)
* Fix missing ~ bitmask in gmc_v11_0.c
v3:
* squash in "drm/amdgpu: Inherit coherence flags on dmabuf import"

Suggested-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Felix Kuehling and committed by
Alex Deucher
d1a372af d852871c

+110 -60
+8 -51
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
··· 405 405 406 406 static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem) 407 407 { 408 - struct amdgpu_device *bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev); 409 - bool coherent = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT; 410 - bool uncached = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED; 411 - uint32_t mapping_flags; 412 - uint64_t pte_flags; 413 - bool snoop = false; 408 + uint32_t mapping_flags = AMDGPU_VM_PAGE_READABLE | 409 + AMDGPU_VM_MTYPE_DEFAULT; 414 410 415 - mapping_flags = AMDGPU_VM_PAGE_READABLE; 416 411 if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE) 417 412 mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE; 418 413 if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE) 419 414 mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; 420 415 421 - switch (adev->ip_versions[GC_HWIP][0]) { 422 - case IP_VERSION(9, 4, 1): 423 - case IP_VERSION(9, 4, 2): 424 - if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { 425 - if (bo_adev == adev) { 426 - if (uncached) 427 - mapping_flags |= AMDGPU_VM_MTYPE_UC; 428 - else if (coherent) 429 - mapping_flags |= AMDGPU_VM_MTYPE_CC; 430 - else 431 - mapping_flags |= AMDGPU_VM_MTYPE_RW; 432 - if ((adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) && 433 - adev->gmc.xgmi.connected_to_cpu) 434 - snoop = true; 435 - } else { 436 - if (uncached || coherent) 437 - mapping_flags |= AMDGPU_VM_MTYPE_UC; 438 - else 439 - mapping_flags |= AMDGPU_VM_MTYPE_NC; 440 - if (amdgpu_xgmi_same_hive(adev, bo_adev)) 441 - snoop = true; 442 - } 443 - } else { 444 - if (uncached || coherent) 445 - mapping_flags |= AMDGPU_VM_MTYPE_UC; 446 - else 447 - mapping_flags |= AMDGPU_VM_MTYPE_NC; 448 - snoop = true; 449 - } 450 - break; 451 - default: 452 - if (uncached || coherent) 453 - mapping_flags |= AMDGPU_VM_MTYPE_UC; 454 - else 455 - mapping_flags |= AMDGPU_VM_MTYPE_NC; 456 - 457 - if (!(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)) 458 - snoop = true; 459 - } 460 - 461 - pte_flags = 
amdgpu_gem_va_map_flags(adev, mapping_flags); 462 - pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; 463 - 464 - return pte_flags; 416 + return amdgpu_gem_va_map_flags(adev, mapping_flags); 465 417 } 466 418 467 419 /** ··· 1624 1672 return -EINVAL; 1625 1673 } 1626 1674 } 1675 + 1676 + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) 1677 + alloc_flags |= AMDGPU_GEM_CREATE_COHERENT; 1678 + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED) 1679 + alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED; 1627 1680 1628 1681 *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); 1629 1682 if (!*mem) {
+3 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
··· 328 328 if (dma_buf->ops == &amdgpu_dmabuf_ops) { 329 329 struct amdgpu_bo *other = gem_to_amdgpu_bo(dma_buf->priv); 330 330 331 - flags |= other->flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC; 331 + flags |= other->flags & (AMDGPU_GEM_CREATE_CPU_GTT_USWC | 332 + AMDGPU_GEM_CREATE_COHERENT | 333 + AMDGPU_GEM_CREATE_UNCACHED); 332 334 } 333 335 334 336 ret = amdgpu_gem_object_create(adev, dma_buf->size, PAGE_SIZE,
+7
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
··· 612 612 struct amdgpu_bo_va_mapping *mapping, 613 613 uint64_t *flags) 614 614 { 615 + struct amdgpu_bo *bo = mapping->bo_va->base.bo; 616 + 615 617 *flags &= ~AMDGPU_PTE_EXECUTABLE; 616 618 *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE; 617 619 ··· 630 628 *flags |= AMDGPU_PTE_SYSTEM; 631 629 *flags &= ~AMDGPU_PTE_VALID; 632 630 } 631 + 632 + if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT | 633 + AMDGPU_GEM_CREATE_UNCACHED)) 634 + *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) | 635 + AMDGPU_PTE_MTYPE_NV10(MTYPE_UC); 633 636 } 634 637 635 638 static unsigned gmc_v10_0_get_vbios_fb_size(struct amdgpu_device *adev)
+7
drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
··· 503 503 struct amdgpu_bo_va_mapping *mapping, 504 504 uint64_t *flags) 505 505 { 506 + struct amdgpu_bo *bo = mapping->bo_va->base.bo; 507 + 506 508 *flags &= ~AMDGPU_PTE_EXECUTABLE; 507 509 *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE; 508 510 ··· 521 519 *flags |= AMDGPU_PTE_SYSTEM; 522 520 *flags &= ~AMDGPU_PTE_VALID; 523 521 } 522 + 523 + if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT | 524 + AMDGPU_GEM_CREATE_UNCACHED)) 525 + *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) | 526 + AMDGPU_PTE_MTYPE_NV10(MTYPE_UC); 524 527 } 525 528 526 529 static unsigned gmc_v11_0_get_vbios_fb_size(struct amdgpu_device *adev)
+71 -8
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
··· 1113 1113 } 1114 1114 } 1115 1115 1116 + static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, 1117 + struct amdgpu_bo *bo, 1118 + struct amdgpu_bo_va_mapping *mapping, 1119 + uint64_t *flags) 1120 + { 1121 + struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); 1122 + bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM; 1123 + bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT; 1124 + bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED; 1125 + unsigned int mtype; 1126 + bool snoop = false; 1127 + 1128 + switch (adev->ip_versions[GC_HWIP][0]) { 1129 + case IP_VERSION(9, 4, 1): 1130 + case IP_VERSION(9, 4, 2): 1131 + if (is_vram) { 1132 + if (bo_adev == adev) { 1133 + if (uncached) 1134 + mtype = MTYPE_UC; 1135 + else if (coherent) 1136 + mtype = MTYPE_CC; 1137 + else 1138 + mtype = MTYPE_RW; 1139 + /* FIXME: is this still needed? Or does 1140 + * amdgpu_ttm_tt_pde_flags already handle this? 1141 + */ 1142 + if (adev->ip_versions[GC_HWIP][0] == 1143 + IP_VERSION(9, 4, 2) && 1144 + adev->gmc.xgmi.connected_to_cpu) 1145 + snoop = true; 1146 + } else { 1147 + if (uncached || coherent) 1148 + mtype = MTYPE_UC; 1149 + else 1150 + mtype = MTYPE_NC; 1151 + if (mapping->bo_va->is_xgmi) 1152 + snoop = true; 1153 + } 1154 + } else { 1155 + if (uncached || coherent) 1156 + mtype = MTYPE_UC; 1157 + else 1158 + mtype = MTYPE_NC; 1159 + /* FIXME: is this still needed? Or does 1160 + * amdgpu_ttm_tt_pde_flags already handle this? 1161 + */ 1162 + snoop = true; 1163 + } 1164 + break; 1165 + default: 1166 + if (uncached || coherent) 1167 + mtype = MTYPE_UC; 1168 + else 1169 + mtype = MTYPE_NC; 1170 + 1171 + /* FIXME: is this still needed? Or does 1172 + * amdgpu_ttm_tt_pde_flags already handle this? 1173 + */ 1174 + if (!is_vram) 1175 + snoop = true; 1176 + } 1177 + 1178 + if (mtype != MTYPE_NC) 1179 + *flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) | 1180 + AMDGPU_PTE_MTYPE_VG10(mtype); 1181 + *flags |= snoop ? 
AMDGPU_PTE_SNOOPED : 0; 1182 + } 1183 + 1116 1184 static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev, 1117 1185 struct amdgpu_bo_va_mapping *mapping, 1118 1186 uint64_t *flags) ··· 1196 1128 *flags &= ~AMDGPU_PTE_VALID; 1197 1129 } 1198 1130 1199 - if ((adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 1) || 1200 - adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) && 1201 - !(*flags & AMDGPU_PTE_SYSTEM) && 1202 - mapping->bo_va->is_xgmi) 1203 - *flags |= AMDGPU_PTE_SNOOPED; 1204 - 1205 - if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 1206 - *flags |= mapping->flags & AMDGPU_PTE_SNOOPED; 1131 + if (mapping->bo_va->base.bo) 1132 + gmc_v9_0_get_coherence_flags(adev, mapping->bo_va->base.bo, 1133 + mapping, flags); 1207 1134 } 1208 1135 1209 1136 static unsigned gmc_v9_0_get_vbios_fb_size(struct amdgpu_device *adev)
+14
include/uapi/drm/amdgpu_drm.h
··· 144 144 * content. 145 145 */ 146 146 #define AMDGPU_GEM_CREATE_DISCARDABLE (1 << 12) 147 + /* Flag that BO is shared coherently between multiple devices or CPU threads. 148 + * May depend on GPU instructions to flush caches explicitly 149 + * 150 + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and 151 + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. 152 + */ 153 + #define AMDGPU_GEM_CREATE_COHERENT (1 << 13) 154 + /* Flag that BO should not be cached by GPU. Coherent without having to flush 155 + * GPU caches explicitly 156 + * 157 + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and 158 + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. 159 + */ 160 + #define AMDGPU_GEM_CREATE_UNCACHED (1 << 14) 147 161 148 162 struct drm_amdgpu_gem_create_in { 149 163 /** the requested memory size */