Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe/migrate: support MEM_COPY instruction

Make this the default on xe2+ when doing a copy. This has a few
advantages over the existing copy instruction:

1) It has a special PAGE_COPY mode that claims to be optimised for
page-in/page-out, which is the vast majority of current users.

2) It also has a simple BYTE_COPY mode that supports byte granularity
copying without any restrictions.

With 2) we can now easily skip the bounce buffer flow when copying
buffers with strange sizing/alignment, like for memory_access. But that
is left for the next patch.

v2 (Matt Brost):
- Use device info to check whether device should use the MEM_COPY
path. This should fit better with making this a configfs tunable.
- And with that also keep old path still functional on xe2 for possible
experimentation.
- Add a define for PAGE_COPY page-size.
v3 (Matt Brost):
- Fallback to an actual linear copy for pitch=1.
- Also update NVL.

BSpec: 57561
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://lore.kernel.org/r/20251022163836.191405-7-matthew.auld@intel.com

+72 -3
+6
drivers/gpu/drm/xe/instructions/xe_gpu_commands.h
··· 31 31 #define XY_FAST_COPY_BLT_D1_DST_TILE4 REG_BIT(30) 32 32 #define XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK GENMASK(23, 20) 33 33 34 + #define MEM_COPY_CMD (2 << 29 | 0x5a << 22 | 0x8) 35 + #define MEM_COPY_PAGE_COPY_MODE REG_BIT(19) 36 + #define MEM_COPY_MATRIX_COPY REG_BIT(17) 37 + #define MEM_COPY_SRC_MOCS_INDEX_MASK GENMASK(31, 28) 38 + #define MEM_COPY_DST_MOCS_INDEX_MASK GENMASK(6, 3) 39 + 34 40 #define PVC_MEM_SET_CMD (2 << 29 | 0x5b << 22) 35 41 #define PVC_MEM_SET_CMD_LEN_DW 7 36 42 #define PVC_MEM_SET_MATRIX REG_BIT(17)
+2
drivers/gpu/drm/xe/xe_device_types.h
··· 300 300 * pcode mailbox commands. 301 301 */ 302 302 u8 has_mbx_power_limits:1; 303 + /** @info.has_mem_copy_instr: Device supports MEM_COPY instruction */ 304 + u8 has_mem_copy_instr:1; 303 305 /** @info.has_pxp: Device has PXP support */ 304 306 u8 has_pxp:1; 305 307 /** @info.has_range_tlb_inval: Has range based TLB invalidations */
+58 -3
drivers/gpu/drm/xe/xe_migrate.c
··· 699 699 } 700 700 701 701 #define EMIT_COPY_DW 10 702 - static void emit_copy(struct xe_gt *gt, struct xe_bb *bb, 703 - u64 src_ofs, u64 dst_ofs, unsigned int size, 704 - unsigned int pitch) 702 + static void emit_xy_fast_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, 703 + u64 dst_ofs, unsigned int size, 704 + unsigned int pitch) 705 705 { 706 706 struct xe_device *xe = gt_to_xe(gt); 707 707 u32 mocs = 0; ··· 728 728 bb->cs[bb->len++] = pitch | mocs; 729 729 bb->cs[bb->len++] = lower_32_bits(src_ofs); 730 730 bb->cs[bb->len++] = upper_32_bits(src_ofs); 731 + } 732 + 733 + #define PAGE_COPY_MODE_PS SZ_256 /* hw uses 256 bytes as the page-size */ 734 + static void emit_mem_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, 735 + u64 dst_ofs, unsigned int size, unsigned int pitch) 736 + { 737 + u32 mode, copy_type, width; 738 + 739 + xe_gt_assert(gt, IS_ALIGNED(size, pitch)); 740 + xe_gt_assert(gt, pitch <= U16_MAX); 741 + xe_gt_assert(gt, pitch); 742 + xe_gt_assert(gt, size); 743 + 744 + if (IS_ALIGNED(size, PAGE_COPY_MODE_PS) && 745 + IS_ALIGNED(lower_32_bits(src_ofs), PAGE_COPY_MODE_PS) && 746 + IS_ALIGNED(lower_32_bits(dst_ofs), PAGE_COPY_MODE_PS)) { 747 + mode = MEM_COPY_PAGE_COPY_MODE; 748 + copy_type = 0; /* linear copy */ 749 + width = size / PAGE_COPY_MODE_PS; 750 + } else if (pitch > 1) { 751 + xe_gt_assert(gt, size / pitch <= U16_MAX); 752 + mode = 0; /* BYTE_COPY */ 753 + copy_type = MEM_COPY_MATRIX_COPY; 754 + width = pitch; 755 + } else { 756 + mode = 0; /* BYTE_COPY */ 757 + copy_type = 0; /* linear copy */ 758 + width = size; 759 + } 760 + 761 + xe_gt_assert(gt, width <= U16_MAX); 762 + 763 + bb->cs[bb->len++] = MEM_COPY_CMD | mode | copy_type; 764 + bb->cs[bb->len++] = width - 1; 765 + bb->cs[bb->len++] = size / pitch - 1; /* ignored by hw for page-copy/linear above */ 766 + bb->cs[bb->len++] = pitch - 1; 767 + bb->cs[bb->len++] = pitch - 1; 768 + bb->cs[bb->len++] = lower_32_bits(src_ofs); 769 + bb->cs[bb->len++] = upper_32_bits(src_ofs); 770 + bb->cs[bb->len++] = lower_32_bits(dst_ofs); 771 + bb->cs[bb->len++] = upper_32_bits(dst_ofs); 772 + bb->cs[bb->len++] = FIELD_PREP(MEM_COPY_SRC_MOCS_INDEX_MASK, gt->mocs.uc_index) | 773 + FIELD_PREP(MEM_COPY_DST_MOCS_INDEX_MASK, gt->mocs.uc_index); 774 + } 775 + 776 + static void emit_copy(struct xe_gt *gt, struct xe_bb *bb, 777 + u64 src_ofs, u64 dst_ofs, unsigned int size, 778 + unsigned int pitch) 779 + { 780 + struct xe_device *xe = gt_to_xe(gt); 781 + 782 + if (xe->info.has_mem_copy_instr) 783 + emit_mem_copy(gt, bb, src_ofs, dst_ofs, size, pitch); 784 + else 785 + emit_xy_fast_copy(gt, bb, src_ofs, dst_ofs, size, pitch); 731 786 } 732 787 733 788 static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
+5
drivers/gpu/drm/xe/xe_pci.c
··· 342 342 .has_display = true, 343 343 .has_flat_ccs = 1, 344 344 .has_pxp = true, 345 + .has_mem_copy_instr = true, 345 346 .max_gt_per_tile = 2, 346 347 .needs_scratch = true, 347 348 .va_bits = 48, ··· 363 362 .has_heci_cscfi = 1, 364 363 .has_late_bind = true, 365 364 .has_sriov = true, 365 + .has_mem_copy_instr = true, 366 366 .max_gt_per_tile = 2, 367 367 .needs_scratch = true, 368 368 .subplatforms = (const struct xe_subplatform_desc[]) { ··· 380 378 .has_display = true, 381 379 .has_flat_ccs = 1, 382 380 .has_sriov = true, 381 + .has_mem_copy_instr = true, 383 382 .max_gt_per_tile = 2, 384 383 .needs_scratch = true, 385 384 .needs_shared_vf_gt_wq = true, ··· 393 390 .dma_mask_size = 46, 394 391 .has_display = true, 395 392 .has_flat_ccs = 1, 393 + .has_mem_copy_instr = true, 396 394 .max_gt_per_tile = 2, 397 395 .require_force_probe = true, 398 396 .va_bits = 48, ··· 659 655 xe->info.has_pxp = desc->has_pxp; 660 656 xe->info.has_sriov = xe_configfs_primary_gt_allowed(to_pci_dev(xe->drm.dev)) && 661 657 desc->has_sriov; 658 + xe->info.has_mem_copy_instr = desc->has_mem_copy_instr; 662 659 xe->info.skip_guc_pc = desc->skip_guc_pc; 663 660 xe->info.skip_mtcfg = desc->skip_mtcfg; 664 661 xe->info.skip_pcode = desc->skip_pcode;
+1
drivers/gpu/drm/xe/xe_pci_types.h
··· 46 46 u8 has_late_bind:1; 47 47 u8 has_llc:1; 48 48 u8 has_mbx_power_limits:1; 49 + u8 has_mem_copy_instr:1; 49 50 u8 has_pxp:1; 50 51 u8 has_sriov:1; 51 52 u8 needs_scratch:1;