Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe/xe2: Introduce identity map for compressed pat for vram

Xe2+ has unified compression (exactly one compression mode/format),
where compression is now controlled via PAT at PTE level.
This simplifies KMD operations, as it can now decompress freely
without concern for the buffer's original compression format—unlike DG2,
which had multiple compression formats and thus required copying the
raw CCS state during VRAM eviction. In addition, mixed VRAM and system
memory buffers were not supported with compression enabled.

On Xe2 dGPU compression is still only supported with VRAM, however we
can now support compression with VRAM and system memory buffers,
with GPU access being seamless underneath, so long as the KMD uses the
compressed -> uncompressed mapping when copying VRAM -> system memory,
in order to decompress it. This also allows CPU access to such buffers,
assuming that userspace first decompresses the corresponding
pages being accessed.
If the pages are already in system memory then KMD would have already
decompressed them. When restoring such buffers with sysmem -> VRAM
the KMD can't easily know which pages were originally compressed,
so we always use uncompressed -> uncompressed here.
With this it also means we can drop all the raw CCS handling on such
platforms (including needing to allocate extra CCS storage).

In order to support this we now need to have two different identity
mappings for compressed and uncompressed VRAM.
In this patch, we set up the additional identity map for the VRAM with
compressed pat_index. We then select the appropriate mapping during
migration/clear. During eviction (vram->sysmem), we use the mapping
from compressed -> uncompressed. During restore (sysmem->vram), we need
the mapping from uncompressed -> uncompressed.
Therefore, we need to have two different mappings for compressed and
uncompressed vram. We set up an additional identity map for the vram
with compressed pat_index.
We then select the appropriate mapping during migration/clear.

v2: Formatting nits, Updated code to match recent changes in
xe_migrate_prepare_vm(). (Matt)

v3: Move identity map loop to a helper function. (Matt Brost)

v4: Split helper function in different patch, and
add asserts and nits. (Matt Brost)

v5: Convert the 2 bool arguments of pte_update_size to flags
argument (Matt Brost)

v6: Formatting nits (Matt Brost)

Signed-off-by: Akshata Jahagirdar <akshata.jahagirdar@intel.com>
Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/b00db5c7267e54260cb6183ba24b15c1e6ae52a3.1721250309.git.akshata.jahagirdar@intel.com

authored by

Akshata Jahagirdar and committed by
Matt Roper
2b808d6b 8d79acd5

+66 -24
+7 -2
drivers/gpu/drm/xe/tests/xe_migrate.c
··· 393 393 u32 flush_flags = 0; 394 394 u32 update_idx; 395 395 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; 396 + u32 pte_flags; 396 397 397 398 src_L0 = xe_migrate_res_sizes(m, &src_it); 398 399 dst_L0 = xe_migrate_res_sizes(m, &dst_it); 399 400 400 401 src_L0 = min(src_L0, dst_L0); 401 402 402 - batch_size += pte_update_size(m, src_is_vram, src_is_vram, src, &src_it, &src_L0, 403 + pte_flags = src_is_vram ? (PTE_UPDATE_FLAG_IS_VRAM | 404 + PTE_UPDATE_FLAG_IS_COMP_PTE) : 0; 405 + batch_size += pte_update_size(m, pte_flags, src, &src_it, &src_L0, 403 406 &src_L0_ofs, &src_L0_pt, 0, 0, 404 407 avail_pts); 405 408 406 - batch_size += pte_update_size(m, dst_is_vram, dst_is_vram, dst, &dst_it, &src_L0, 409 + pte_flags = dst_is_vram ? (PTE_UPDATE_FLAG_IS_VRAM | 410 + PTE_UPDATE_FLAG_IS_COMP_PTE) : 0; 411 + batch_size += pte_update_size(m, pte_flags, dst, &dst_it, &src_L0, 407 412 &dst_L0_ofs, &dst_L0_pt, 0, 408 413 avail_pts, avail_pts); 409 414
+59 -22
drivers/gpu/drm/xe/xe_migrate.c
··· 73 73 #define NUM_PT_SLOTS 32 74 74 #define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M 75 75 #define MAX_NUM_PTE 512 76 + #define IDENTITY_OFFSET 256ULL 76 77 77 78 /* 78 79 * Although MI_STORE_DATA_IMM's "length" field is 10-bits, 0x3FE is the largest ··· 121 120 return (slot + 1ULL) << xe_pt_shift(level + 1); 122 121 } 123 122 124 - static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr) 123 + static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool is_comp_pte) 125 124 { 126 125 /* 127 126 * Remove the DPA to get a correct offset into identity table for the 128 127 * migrate offset 129 128 */ 129 + u64 identity_offset = IDENTITY_OFFSET; 130 + 131 + if (GRAPHICS_VER(xe) >= 20 && is_comp_pte) 132 + identity_offset += DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G); 133 + 130 134 addr -= xe->mem.vram.dpa_base; 131 - return addr + (256ULL << xe_pt_shift(2)); 135 + return addr + (identity_offset << xe_pt_shift(2)); 132 136 } 133 137 134 138 static void xe_migrate_program_identity(struct xe_device *xe, struct xe_vm *vm, struct xe_bo *bo, ··· 187 181 struct xe_device *xe = tile_to_xe(tile); 188 182 u16 pat_index = xe->pat.idx[XE_CACHE_WB]; 189 183 u8 id = tile->id; 190 - u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level, 191 - num_setup = num_level + 1; 184 + u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level; 185 + #define VRAM_IDENTITY_MAP_COUNT 2 186 + u32 num_setup = num_level + VRAM_IDENTITY_MAP_COUNT; 187 + #undef VRAM_IDENTITY_MAP_COUNT 192 188 u32 map_ofs, level, i; 193 189 struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo; 194 - u64 entry, pt30_ofs; 190 + u64 entry, pt29_ofs; 195 191 196 192 /* Can't bump NUM_PT_SLOTS too high */ 197 193 BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE); ··· 213 205 if (IS_ERR(bo)) 214 206 return PTR_ERR(bo); 215 207 216 - /* PT31 reserved for 2M identity map */ 217 - pt30_ofs = bo->size - 2 * XE_PAGE_SIZE; 218 - entry = vm->pt_ops->pde_encode_bo(bo, pt30_ofs, 
pat_index); 208 + /* PT30 & PT31 reserved for 2M identity map */ 209 + pt29_ofs = bo->size - 3 * XE_PAGE_SIZE; 210 + entry = vm->pt_ops->pde_encode_bo(bo, pt29_ofs, pat_index); 219 211 xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry); 220 212 221 213 map_ofs = (num_entries - num_setup) * XE_PAGE_SIZE; ··· 267 259 } else { 268 260 u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE); 269 261 270 - m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr); 262 + m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false); 271 263 272 264 if (xe->info.has_usm) { 273 265 batch = tile->primary_gt->usm.bb_pool->bo; 274 266 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE); 275 - m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr); 267 + m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false); 276 268 } 277 269 } 278 270 ··· 306 298 307 299 /* Identity map the entire vram at 256GiB offset */ 308 300 if (IS_DGFX(xe)) { 309 - u64 pt31_ofs = bo->size - XE_PAGE_SIZE; 301 + u64 pt30_ofs = bo->size - 2 * XE_PAGE_SIZE; 310 302 311 - xe_migrate_program_identity(xe, vm, bo, map_ofs, 256, pat_index, pt31_ofs); 312 - xe_assert(xe, (xe->mem.vram.actual_physical_size <= SZ_256G)); 303 + xe_migrate_program_identity(xe, vm, bo, map_ofs, IDENTITY_OFFSET, 304 + pat_index, pt30_ofs); 305 + xe_assert(xe, xe->mem.vram.actual_physical_size <= 306 + (MAX_NUM_PTE - IDENTITY_OFFSET) * SZ_1G); 307 + 308 + /* 309 + * Identity map the entire vram for compressed pat_index for xe2+ 310 + * if flat ccs is enabled. 
311 + */ 312 + if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) { 313 + u16 comp_pat_index = xe->pat.idx[XE_CACHE_NONE_COMPRESSION]; 314 + u64 vram_offset = IDENTITY_OFFSET + 315 + DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G); 316 + u64 pt31_ofs = bo->size - XE_PAGE_SIZE; 317 + 318 + xe_assert(xe, xe->mem.vram.actual_physical_size <= (MAX_NUM_PTE - 319 + IDENTITY_OFFSET - IDENTITY_OFFSET / 2) * SZ_1G); 320 + xe_migrate_program_identity(xe, vm, bo, map_ofs, vram_offset, 321 + comp_pat_index, pt31_ofs); 322 + } 313 323 } 314 324 315 325 /* 316 326 * Example layout created above, with root level = 3: 317 327 * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's 318 328 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's 319 - * [PT9...PT27]: Userspace PT's for VM_BIND, 4 KiB PTE's 320 - * [PT28 = PDE 0] [PT29 = PDE 1] [PT30 = PDE 2] [PT31 = 2M vram identity map] 329 + * [PT9...PT26]: Userspace PT's for VM_BIND, 4 KiB PTE's 330 + * [PT27 = PDE 0] [PT28 = PDE 1] [PT29 = PDE 2] [PT30 & PT31 = 2M vram identity map] 321 331 * 322 332 * This makes the lowest part of the VM point to the pagetables. 323 333 * Hence the lowest 2M in the vm should point to itself, with a few writes ··· 513 487 return cur->size >= size; 514 488 } 515 489 490 + #define PTE_UPDATE_FLAG_IS_VRAM BIT(0) 491 + #define PTE_UPDATE_FLAG_IS_COMP_PTE BIT(1) 492 + 516 493 static u32 pte_update_size(struct xe_migrate *m, 517 - bool is_vram, 494 + u32 flags, 518 495 struct ttm_resource *res, 519 496 struct xe_res_cursor *cur, 520 497 u64 *L0, u64 *L0_ofs, u32 *L0_pt, 521 498 u32 cmd_size, u32 pt_ofs, u32 avail_pts) 522 499 { 523 500 u32 cmds = 0; 501 + bool is_vram = PTE_UPDATE_FLAG_IS_VRAM & flags; 502 + bool is_comp_pte = PTE_UPDATE_FLAG_IS_COMP_PTE & flags; 524 503 525 504 *L0_pt = pt_ofs; 526 505 if (is_vram && xe_migrate_allow_identity(*L0, cur)) { 527 506 /* Offset into identity map. 
*/ 528 507 *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile), 529 - cur->start + vram_region_gpu_offset(res)); 508 + cur->start + vram_region_gpu_offset(res), 509 + is_comp_pte); 530 510 cmds += cmd_size; 531 511 } else { 532 512 /* Clip L0 to available size */ ··· 811 779 u32 update_idx; 812 780 u64 ccs_ofs, ccs_size; 813 781 u32 ccs_pt; 782 + u32 pte_flags; 814 783 815 784 bool usm = xe->info.has_usm; 816 785 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; ··· 824 791 825 792 src_L0 = min(src_L0, dst_L0); 826 793 827 - batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0, 794 + pte_flags = src_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; 795 + batch_size += pte_update_size(m, pte_flags, src, &src_it, &src_L0, 828 796 &src_L0_ofs, &src_L0_pt, 0, 0, 829 797 avail_pts); 830 798 831 - batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0, 799 + pte_flags = dst_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; 800 + batch_size += pte_update_size(m, pte_flags, dst, &dst_it, &src_L0, 832 801 &dst_L0_ofs, &dst_L0_pt, 0, 833 802 avail_pts, avail_pts); 834 803 835 804 if (copy_system_ccs) { 836 805 ccs_size = xe_device_ccs_bytes(xe, src_L0); 837 - batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size, 806 + batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, 838 807 &ccs_ofs, &ccs_pt, 0, 839 808 2 * avail_pts, 840 809 avail_pts); ··· 1069 1034 struct xe_sched_job *job; 1070 1035 struct xe_bb *bb; 1071 1036 u32 batch_size, update_idx; 1037 + u32 pte_flags; 1072 1038 1073 1039 bool usm = xe->info.has_usm; 1074 1040 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; ··· 1077 1041 clear_L0 = xe_migrate_res_sizes(m, &src_it); 1078 1042 1079 1043 /* Calculate final sizes and batch size.. */ 1044 + pte_flags = clear_vram ? 
PTE_UPDATE_FLAG_IS_VRAM : 0; 1080 1045 batch_size = 2 + 1081 - pte_update_size(m, clear_vram, src, &src_it, 1046 + pte_update_size(m, pte_flags, src, &src_it, 1082 1047 &clear_L0, &clear_L0_ofs, &clear_L0_pt, 1083 1048 clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0, 1084 1049 avail_pts); ··· 1196 1159 if (!ppgtt_ofs) 1197 1160 ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile), 1198 1161 xe_bo_addr(update->pt_bo, 0, 1199 - XE_PAGE_SIZE)); 1162 + XE_PAGE_SIZE), false); 1200 1163 1201 1164 do { 1202 1165 u64 addr = ppgtt_ofs + ofs * 8;