drm/xe: Add more SVM GT stats

Add more SVM GT stats which give visibility into where time is spent in
the SVM page fault handler. The stats include the number of faults at a
given size, total SVM page fault time, migration time in us, copy time
in us, copy size in kb, get pages time in us, and bind time in us. This
will help in tuning SVM for performance.
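
For reference, each timed phase follows the same small pattern, sketched
below. This is illustrative only (the wrapper function name is made up
for the example); it uses the xe_svm_stats_ktime_get() and
xe_svm_range_*_us_stats_incr() helpers added in xe_svm.c. With
CONFIG_DEBUG_FS disabled the timestamp helpers return 0, so the counters
are effectively free, and they are reported through the existing per-GT
stats debugfs interface that prints stat_description[].

static void svm_bind_phase_sketch(struct xe_gt *gt, struct xe_svm_range *range)
{
        /* 0 when CONFIG_DEBUG_FS is disabled */
        ktime_t bind_start = xe_svm_stats_ktime_get();

        /* ... the timed work, e.g. rebinding the range and waiting on the fence ... */

        /*
         * Helper generated by DECL_SVM_RANGE_US_STATS(bind, BIND); it picks
         * the 4K/64K/2M bucket from xe_svm_range_size(range).
         */
        xe_svm_range_bind_us_stats_incr(gt, range, bind_start);
}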

v2:
- Include local changes
v3:
- Add tlb invalidation + valid page fault + per size copy size stats
v4:
- Ensure gt not NULL when incrementing SVM copy stats
- Normalize stats names
- Use magic macros to generate increment functions for ranges
v7:
- Use DEF_STAT_STR (Michal)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Francois Dugast <francois.dugast@intel.com>
Link: https://lore.kernel.org/r/20250829172232.1308004-3-matthew.brost@intel.com

+257 -13

drivers/gpu/drm/xe/xe_gt_stats.c (+39 -4)
···
         atomic64_add(incr, &gt->stats.counters[id]);
 }
 
+#define DEF_STAT_STR(ID, name) [XE_GT_STATS_ID_##ID] = name
+
 static const char *const stat_description[__XE_GT_STATS_NUM_IDS] = {
-        "svm_pagefault_count",
-        "tlb_inval_count",
-        "vma_pagefault_count",
-        "vma_pagefault_kb",
+        DEF_STAT_STR(SVM_PAGEFAULT_COUNT, "svm_pagefault_count"),
+        DEF_STAT_STR(TLB_INVAL, "tlb_inval_count"),
+        DEF_STAT_STR(SVM_TLB_INVAL_COUNT, "svm_tlb_inval_count"),
+        DEF_STAT_STR(SVM_TLB_INVAL_US, "svm_tlb_inval_us"),
+        DEF_STAT_STR(VMA_PAGEFAULT_COUNT, "vma_pagefault_count"),
+        DEF_STAT_STR(VMA_PAGEFAULT_KB, "vma_pagefault_kb"),
+        DEF_STAT_STR(SVM_4K_PAGEFAULT_COUNT, "svm_4K_pagefault_count"),
+        DEF_STAT_STR(SVM_64K_PAGEFAULT_COUNT, "svm_64K_pagefault_count"),
+        DEF_STAT_STR(SVM_2M_PAGEFAULT_COUNT, "svm_2M_pagefault_count"),
+        DEF_STAT_STR(SVM_4K_VALID_PAGEFAULT_COUNT, "svm_4K_valid_pagefault_count"),
+        DEF_STAT_STR(SVM_64K_VALID_PAGEFAULT_COUNT, "svm_64K_valid_pagefault_count"),
+        DEF_STAT_STR(SVM_2M_VALID_PAGEFAULT_COUNT, "svm_2M_valid_pagefault_count"),
+        DEF_STAT_STR(SVM_4K_PAGEFAULT_US, "svm_4K_pagefault_us"),
+        DEF_STAT_STR(SVM_64K_PAGEFAULT_US, "svm_64K_pagefault_us"),
+        DEF_STAT_STR(SVM_2M_PAGEFAULT_US, "svm_2M_pagefault_us"),
+        DEF_STAT_STR(SVM_4K_MIGRATE_COUNT, "svm_4K_migrate_count"),
+        DEF_STAT_STR(SVM_64K_MIGRATE_COUNT, "svm_64K_migrate_count"),
+        DEF_STAT_STR(SVM_2M_MIGRATE_COUNT, "svm_2M_migrate_count"),
+        DEF_STAT_STR(SVM_4K_MIGRATE_US, "svm_4K_migrate_us"),
+        DEF_STAT_STR(SVM_64K_MIGRATE_US, "svm_64K_migrate_us"),
+        DEF_STAT_STR(SVM_2M_MIGRATE_US, "svm_2M_migrate_us"),
+        DEF_STAT_STR(SVM_DEVICE_COPY_US, "svm_device_copy_us"),
+        DEF_STAT_STR(SVM_4K_DEVICE_COPY_US, "svm_4K_device_copy_us"),
+        DEF_STAT_STR(SVM_64K_DEVICE_COPY_US, "svm_64K_device_copy_us"),
+        DEF_STAT_STR(SVM_2M_DEVICE_COPY_US, "svm_2M_device_copy_us"),
+        DEF_STAT_STR(SVM_CPU_COPY_US, "svm_cpu_copy_us"),
+        DEF_STAT_STR(SVM_4K_CPU_COPY_US, "svm_4K_cpu_copy_us"),
+        DEF_STAT_STR(SVM_64K_CPU_COPY_US, "svm_64K_cpu_copy_us"),
+        DEF_STAT_STR(SVM_2M_CPU_COPY_US, "svm_2M_cpu_copy_us"),
+        DEF_STAT_STR(SVM_DEVICE_COPY_KB, "svm_device_copy_kb"),
+        DEF_STAT_STR(SVM_CPU_COPY_KB, "svm_cpu_copy_kb"),
+        DEF_STAT_STR(SVM_4K_GET_PAGES_US, "svm_4K_get_pages_us"),
+        DEF_STAT_STR(SVM_64K_GET_PAGES_US, "svm_64K_get_pages_us"),
+        DEF_STAT_STR(SVM_2M_GET_PAGES_US, "svm_2M_get_pages_us"),
+        DEF_STAT_STR(SVM_4K_BIND_US, "svm_4K_bind_us"),
+        DEF_STAT_STR(SVM_64K_BIND_US, "svm_64K_bind_us"),
+        DEF_STAT_STR(SVM_2M_BIND_US, "svm_2M_bind_us"),
 };
 
 /**
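
Note on DEF_STAT_STR: each entry is a designated initializer keyed by the
enum value, so stat_description[] can no longer drift out of order with
enum xe_gt_stats_id if entries are reordered. For example, the first new
entry expands to:

        /* DEF_STAT_STR(SVM_TLB_INVAL_COUNT, "svm_tlb_inval_count") expands to: */
        [XE_GT_STATS_ID_SVM_TLB_INVAL_COUNT] = "svm_tlb_inval_count",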

drivers/gpu/drm/xe/xe_gt_stats_types.h (+33)
···
 enum xe_gt_stats_id {
         XE_GT_STATS_ID_SVM_PAGEFAULT_COUNT,
         XE_GT_STATS_ID_TLB_INVAL,
+        XE_GT_STATS_ID_SVM_TLB_INVAL_COUNT,
+        XE_GT_STATS_ID_SVM_TLB_INVAL_US,
         XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT,
         XE_GT_STATS_ID_VMA_PAGEFAULT_KB,
+        XE_GT_STATS_ID_SVM_4K_PAGEFAULT_COUNT,
+        XE_GT_STATS_ID_SVM_64K_PAGEFAULT_COUNT,
+        XE_GT_STATS_ID_SVM_2M_PAGEFAULT_COUNT,
+        XE_GT_STATS_ID_SVM_4K_VALID_PAGEFAULT_COUNT,
+        XE_GT_STATS_ID_SVM_64K_VALID_PAGEFAULT_COUNT,
+        XE_GT_STATS_ID_SVM_2M_VALID_PAGEFAULT_COUNT,
+        XE_GT_STATS_ID_SVM_4K_PAGEFAULT_US,
+        XE_GT_STATS_ID_SVM_64K_PAGEFAULT_US,
+        XE_GT_STATS_ID_SVM_2M_PAGEFAULT_US,
+        XE_GT_STATS_ID_SVM_4K_MIGRATE_COUNT,
+        XE_GT_STATS_ID_SVM_64K_MIGRATE_COUNT,
+        XE_GT_STATS_ID_SVM_2M_MIGRATE_COUNT,
+        XE_GT_STATS_ID_SVM_4K_MIGRATE_US,
+        XE_GT_STATS_ID_SVM_64K_MIGRATE_US,
+        XE_GT_STATS_ID_SVM_2M_MIGRATE_US,
+        XE_GT_STATS_ID_SVM_DEVICE_COPY_US,
+        XE_GT_STATS_ID_SVM_4K_DEVICE_COPY_US,
+        XE_GT_STATS_ID_SVM_64K_DEVICE_COPY_US,
+        XE_GT_STATS_ID_SVM_2M_DEVICE_COPY_US,
+        XE_GT_STATS_ID_SVM_CPU_COPY_US,
+        XE_GT_STATS_ID_SVM_4K_CPU_COPY_US,
+        XE_GT_STATS_ID_SVM_64K_CPU_COPY_US,
+        XE_GT_STATS_ID_SVM_2M_CPU_COPY_US,
+        XE_GT_STATS_ID_SVM_DEVICE_COPY_KB,
+        XE_GT_STATS_ID_SVM_CPU_COPY_KB,
+        XE_GT_STATS_ID_SVM_4K_GET_PAGES_US,
+        XE_GT_STATS_ID_SVM_64K_GET_PAGES_US,
+        XE_GT_STATS_ID_SVM_2M_GET_PAGES_US,
+        XE_GT_STATS_ID_SVM_4K_BIND_US,
+        XE_GT_STATS_ID_SVM_64K_BIND_US,
+        XE_GT_STATS_ID_SVM_2M_BIND_US,
         /* must be the last entry */
         __XE_GT_STATS_NUM_IDS,
 };
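
Because __XE_GT_STATS_NUM_IDS stays the last entry, the new IDs
automatically size the per-GT counter storage and the description table.
A minimal sketch of that storage, matching the atomic64_add() usage
visible in xe_gt_stats.c above (the exact field layout in the GT struct
is not shown in this diff):

        /* Sketch only: one 64-bit atomic counter per enum xe_gt_stats_id entry. */
        struct {
                atomic64_t counters[__XE_GT_STATS_NUM_IDS];
        } stats;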

drivers/gpu/drm/xe/xe_svm.c (+185 -9)
···
 #include <drm/drm_drv.h>
 
 #include "xe_bo.h"
+#include "xe_exec_queue_types.h"
 #include "xe_gt_stats.h"
 #include "xe_migrate.h"
 #include "xe_module.h"
···
                            &vm->svm.garbage_collector.work);
 }
 
+static void xe_svm_tlb_inval_count_stats_incr(struct xe_gt *gt)
+{
+        xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_TLB_INVAL_COUNT, 1);
+}
+
 static u8
 xe_svm_range_notifier_event_begin(struct xe_vm *vm, struct drm_gpusvm_range *r,
                                   const struct mmu_notifier_range *mmu_range,
···
          */
         for_each_tile(tile, xe, id)
                 if (xe_pt_zap_ptes_range(tile, vm, range)) {
-                        tile_mask |= BIT(id);
                         /*
                          * WRITE_ONCE pairs with READ_ONCE in
                          * xe_vm_has_valid_gpu_mapping()
                          */
                         WRITE_ONCE(range->tile_invalidated,
                                    range->tile_invalidated | BIT(id));
+
+                        if (!(tile_mask & BIT(id))) {
+                                xe_svm_tlb_inval_count_stats_incr(tile->primary_gt);
+                                if (tile->media_gt)
+                                        xe_svm_tlb_inval_count_stats_incr(tile->media_gt);
+                                tile_mask |= BIT(id);
+                        }
                 }
 
         return tile_mask;
···
                                            mmu_range);
 }
 
+static s64 xe_svm_stats_ktime_us_delta(ktime_t start)
+{
+        return IS_ENABLED(CONFIG_DEBUG_FS) ?
+                ktime_us_delta(ktime_get(), start) : 0;
+}
+
+static void xe_svm_tlb_inval_us_stats_incr(struct xe_gt *gt, ktime_t start)
+{
+        s64 us_delta = xe_svm_stats_ktime_us_delta(start);
+
+        xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_TLB_INVAL_US, us_delta);
+}
+
+static ktime_t xe_svm_stats_ktime_get(void)
+{
+        return IS_ENABLED(CONFIG_DEBUG_FS) ? ktime_get() : 0;
+}
+
 static void xe_svm_invalidate(struct drm_gpusvm *gpusvm,
                               struct drm_gpusvm_notifier *notifier,
                               const struct mmu_notifier_range *mmu_range)
···
         struct xe_vm *vm = gpusvm_to_vm(gpusvm);
         struct xe_device *xe = vm->xe;
         struct drm_gpusvm_range *r, *first;
+        struct xe_tile *tile;
+        ktime_t start = xe_svm_stats_ktime_get();
         u64 adj_start = mmu_range->start, adj_end = mmu_range->end;
-        u8 tile_mask = 0;
+        u8 tile_mask = 0, id;
         long err;
 
         xe_svm_assert_in_notifier(vm);
···
         r = first;
         drm_gpusvm_for_each_range(r, notifier, adj_start, adj_end)
                 xe_svm_range_notifier_event_end(vm, r, mmu_range);
+        for_each_tile(tile, xe, id) {
+                if (tile_mask & BIT(id)) {
+                        xe_svm_tlb_inval_us_stats_incr(tile->primary_gt, start);
+                        if (tile->media_gt)
+                                xe_svm_tlb_inval_us_stats_incr(tile->media_gt, start);
+                }
+        }
 }
 
 static int __xe_svm_garbage_collector(struct xe_vm *vm,
···
         XE_SVM_COPY_TO_SRAM,
 };
 
+static void xe_svm_copy_kb_stats_incr(struct xe_gt *gt,
+                                      const enum xe_svm_copy_dir dir,
+                                      int kb)
+{
+        if (dir == XE_SVM_COPY_TO_VRAM)
+                xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_DEVICE_COPY_KB, kb);
+        else
+                xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_CPU_COPY_KB, kb);
+}
+
+static void xe_svm_copy_us_stats_incr(struct xe_gt *gt,
+                                      const enum xe_svm_copy_dir dir,
+                                      unsigned long npages,
+                                      ktime_t start)
+{
+        s64 us_delta = xe_svm_stats_ktime_us_delta(start);
+
+        if (dir == XE_SVM_COPY_TO_VRAM) {
+                switch (npages) {
+                case 1:
+                        xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_4K_DEVICE_COPY_US,
+                                         us_delta);
+                        break;
+                case 16:
+                        xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_64K_DEVICE_COPY_US,
+                                         us_delta);
+                        break;
+                case 512:
+                        xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_2M_DEVICE_COPY_US,
+                                         us_delta);
+                        break;
+                }
+                xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_DEVICE_COPY_US,
+                                 us_delta);
+        } else {
+                switch (npages) {
+                case 1:
+                        xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_4K_CPU_COPY_US,
+                                         us_delta);
+                        break;
+                case 16:
+                        xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_64K_CPU_COPY_US,
+                                         us_delta);
+                        break;
+                case 512:
+                        xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_2M_CPU_COPY_US,
+                                         us_delta);
+                        break;
+                }
+                xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_CPU_COPY_US,
+                                 us_delta);
+        }
+}
+
 static int xe_svm_copy(struct page **pages,
                        struct drm_pagemap_addr *pagemap_addr,
                        unsigned long npages, const enum xe_svm_copy_dir dir)
 {
         struct xe_vram_region *vr = NULL;
+        struct xe_gt *gt = NULL;
         struct xe_device *xe;
         struct dma_fence *fence = NULL;
         unsigned long i;
···
         u64 vram_addr = XE_VRAM_ADDR_INVALID;
         int err = 0, pos = 0;
         bool sram = dir == XE_SVM_COPY_TO_SRAM;
+        ktime_t start = xe_svm_stats_ktime_get();
 
         /*
          * This flow is complex: it locates physically contiguous device pages,
···
 
                 if (!vr && spage) {
                         vr = page_to_vr(spage);
+                        gt = xe_migrate_exec_queue(vr->migrate)->gt;
                         xe = vr->xe;
                 }
                 XE_WARN_ON(spage && page_to_vr(spage) != vr);
···
                         int incr = (match && last) ? 1 : 0;
 
                         if (vram_addr != XE_VRAM_ADDR_INVALID) {
+                                xe_svm_copy_kb_stats_incr(gt, dir,
+                                                          (i - pos + incr) *
+                                                          (PAGE_SIZE / SZ_1K));
                                 if (sram) {
                                         vm_dbg(&xe->drm,
                                                "COPY TO SRAM - 0x%016llx -> 0x%016llx, NPAGES=%ld",
···
 
                 /* Extra mismatched device page, copy it */
                 if (!match && last && vram_addr != XE_VRAM_ADDR_INVALID) {
+                        xe_svm_copy_kb_stats_incr(gt, dir,
+                                                  (PAGE_SIZE / SZ_1K));
                         if (sram) {
                                 vm_dbg(&xe->drm,
                                        "COPY TO SRAM - 0x%016llx -> 0x%016llx, NPAGES=%d",
···
                 dma_fence_wait(fence, false);
                 dma_fence_put(fence);
         }
+
+        /*
+         * XXX: We can't derive the GT here (or anywhere in this function),
+         * but compute always uses the primary GT, so accumulate stats on the
+         * likely GT of the fault.
+         */
+        if (gt)
+                xe_svm_copy_us_stats_incr(gt, dir, npages, start);
 
         return err;
 #undef XE_MIGRATE_CHUNK_SIZE
···
         return true;
 }
 
+#define DECL_SVM_RANGE_COUNT_STATS(elem, stat) \
+static void xe_svm_range_##elem##_count_stats_incr(struct xe_gt *gt, \
+                                                   struct xe_svm_range *range) \
+{ \
+        switch (xe_svm_range_size(range)) { \
+        case SZ_4K: \
+                xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_4K_##stat##_COUNT, 1); \
+                break; \
+        case SZ_64K: \
+                xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_64K_##stat##_COUNT, 1); \
+                break; \
+        case SZ_2M: \
+                xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_2M_##stat##_COUNT, 1); \
+                break; \
+        } \
+} \
+
+DECL_SVM_RANGE_COUNT_STATS(fault, PAGEFAULT)
+DECL_SVM_RANGE_COUNT_STATS(valid_fault, VALID_PAGEFAULT)
+DECL_SVM_RANGE_COUNT_STATS(migrate, MIGRATE)
+
+#define DECL_SVM_RANGE_US_STATS(elem, stat) \
+static void xe_svm_range_##elem##_us_stats_incr(struct xe_gt *gt, \
+                                                struct xe_svm_range *range, \
+                                                ktime_t start) \
+{ \
+        s64 us_delta = xe_svm_stats_ktime_us_delta(start); \
+\
+        switch (xe_svm_range_size(range)) { \
+        case SZ_4K: \
+                xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_4K_##stat##_US, \
+                                 us_delta); \
+                break; \
+        case SZ_64K: \
+                xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_64K_##stat##_US, \
+                                 us_delta); \
+                break; \
+        case SZ_2M: \
+                xe_gt_stats_incr(gt, XE_GT_STATS_ID_SVM_2M_##stat##_US, \
+                                 us_delta); \
+                break; \
+        } \
+} \
+
+DECL_SVM_RANGE_US_STATS(migrate, MIGRATE)
+DECL_SVM_RANGE_US_STATS(get_pages, GET_PAGES)
+DECL_SVM_RANGE_US_STATS(bind, BIND)
+DECL_SVM_RANGE_US_STATS(fault, PAGEFAULT)
+
 static int __xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
                                      struct xe_gt *gt, u64 fault_addr,
                                      bool need_vram)
···
         struct xe_tile *tile = gt_to_tile(gt);
         int migrate_try_count = ctx.devmem_only ? 3 : 1;
         ktime_t end = 0;
+        ktime_t start = xe_svm_stats_ktime_get(), bind_start, get_pages_start;
         int err;
 
         lockdep_assert_held_write(&vm->lock);
···
         if (IS_ERR(range))
                 return PTR_ERR(range);
 
-        if (ctx.devmem_only && !range->base.flags.migrate_devmem)
-                return -EACCES;
+        xe_svm_range_fault_count_stats_incr(gt, range);
 
-        if (xe_svm_range_is_valid(range, tile, ctx.devmem_only))
-                return 0;
+        if (ctx.devmem_only && !range->base.flags.migrate_devmem) {
+                err = -EACCES;
+                goto out;
+        }
+
+        if (xe_svm_range_is_valid(range, tile, ctx.devmem_only)) {
+                xe_svm_range_valid_fault_count_stats_incr(gt, range);
+                range_debug(range, "PAGE FAULT - VALID");
+                goto out;
+        }
 
         range_debug(range, "PAGE FAULT");
 
         dpagemap = xe_vma_resolve_pagemap(vma, tile);
         if (--migrate_try_count >= 0 &&
             xe_svm_range_needs_migrate_to_vram(range, vma, !!dpagemap || ctx.devmem_only)) {
+                ktime_t migrate_start = xe_svm_stats_ktime_get();
+
                 /* TODO : For multi-device dpagemap will be used to find the
                  * remote tile and remote device. Will need to modify
                  * xe_svm_alloc_vram to use dpagemap for future multi-device
                  * support.
                  */
+                xe_svm_range_migrate_count_stats_incr(gt, range);
                 err = xe_svm_alloc_vram(tile, range, &ctx);
+                xe_svm_range_migrate_us_stats_incr(gt, range, migrate_start);
                 ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */
                 if (err) {
                         if (migrate_try_count || !ctx.devmem_only) {
···
                         }
                 }
         }
+
+        get_pages_start = xe_svm_stats_ktime_get();
 
         range_debug(range, "GET PAGES");
         err = xe_svm_range_get_pages(vm, range, &ctx);
···
         }
         if (err) {
                 range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT");
-                goto err_out;
+                goto out;
         }
 
+        xe_svm_range_get_pages_us_stats_incr(gt, range, get_pages_start);
         range_debug(range, "PAGE FAULT - BIND");
 
+        bind_start = xe_svm_stats_ktime_get();
 retry_bind:
         xe_vm_lock(vm, false);
         fence = xe_vm_range_rebind(vm, vma, range, BIT(tile->id));
···
                 }
                 if (xe_vm_validate_should_retry(NULL, err, &end))
                         goto retry_bind;
-                goto err_out;
+                goto out;
         }
         xe_vm_unlock(vm);
 
         dma_fence_wait(fence, false);
         dma_fence_put(fence);
+        xe_svm_range_bind_us_stats_incr(gt, range, bind_start);
 
-err_out:
+out:
+        xe_svm_range_fault_us_stats_incr(gt, range, start);
 
         return err;
 }