Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdkfd: Output migrate end event if migrate failed

If page migration fails, also output the migrate end event so that it
matches the migrate start event, with the failure error_code appended to
the end of the migrate message macro. This will not break the uAPI, because
an application that parses the event with sscanf and the old message macro
simply drops and ignores the trailing error_code.

Output GPU page fault restore end event if migration failed.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Philip Yang and committed by
Alex Deucher
dad6c45c 8fc279e5

+19 -17
+6 -8
drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
··· 445 445 pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n", 446 446 mpages, cpages, migrate.npages); 447 447 448 - kfd_smi_event_migration_end(node, p->lead_thread->pid, 449 - start >> PAGE_SHIFT, end >> PAGE_SHIFT, 450 - 0, node->id, trigger); 451 - 452 448 svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); 453 449 454 450 out_free: 455 451 kvfree(buf); 452 + kfd_smi_event_migration_end(node, p->lead_thread->pid, 453 + start >> PAGE_SHIFT, end >> PAGE_SHIFT, 454 + 0, node->id, trigger, r); 456 455 out: 457 456 if (!r && mpages) { 458 457 pdd = svm_range_get_pdd_by_node(prange, node); ··· 750 751 svm_migrate_copy_done(adev, mfence); 751 752 migrate_vma_finalize(&migrate); 752 753 753 - kfd_smi_event_migration_end(node, p->lead_thread->pid, 754 - start >> PAGE_SHIFT, end >> PAGE_SHIFT, 755 - node->id, 0, trigger); 756 - 757 754 svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); 758 755 759 756 out_free: 760 757 kvfree(buf); 758 + kfd_smi_event_migration_end(node, p->lead_thread->pid, 759 + start >> PAGE_SHIFT, end >> PAGE_SHIFT, 760 + node->id, 0, trigger, r); 761 761 out: 762 762 if (!r && cpages) { 763 763 mpages = cpages - upages;
+3 -2
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
··· 292 292 293 293 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, 294 294 unsigned long start, unsigned long end, 295 - uint32_t from, uint32_t to, uint32_t trigger) 295 + uint32_t from, uint32_t to, uint32_t trigger, 296 + int error_code) 296 297 { 297 298 kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, 298 299 KFD_EVENT_FMT_MIGRATE_END( 299 300 ktime_get_boottime_ns(), pid, start, end - start, 300 - from, to, trigger)); 301 + from, to, trigger, error_code)); 301 302 } 302 303 303 304 void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid,
+2 -1
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
··· 44 44 uint32_t trigger); 45 45 void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, 46 46 unsigned long start, unsigned long end, 47 - uint32_t from, uint32_t to, uint32_t trigger); 47 + uint32_t from, uint32_t to, uint32_t trigger, 48 + int error_code); 48 49 void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, 49 50 uint32_t trigger); 50 51 void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid);
+4 -3
drivers/gpu/drm/amd/amdkfd/kfd_svm.c
··· 3085 3085 start = max_t(unsigned long, ALIGN_DOWN(addr, size), prange->start); 3086 3086 last = min_t(unsigned long, ALIGN(addr + 1, size) - 1, prange->last); 3087 3087 if (prange->actual_loc != 0 || best_loc != 0) { 3088 - migration = true; 3089 - 3090 3088 if (best_loc) { 3091 3089 r = svm_migrate_to_vram(prange, best_loc, start, last, 3092 3090 mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); ··· 3107 3109 if (r) { 3108 3110 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", 3109 3111 r, svms, start, last); 3110 - goto out_unlock_range; 3112 + goto out_migrate_fail; 3113 + } else { 3114 + migration = true; 3111 3115 } 3112 3116 } 3113 3117 ··· 3119 3119 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", 3120 3120 r, svms, start, last); 3121 3121 3122 + out_migrate_fail: 3122 3123 kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr, 3123 3124 migration); 3124 3125
+4 -3
include/uapi/linux/kfd_ioctl.h
··· 609 609 * migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update 610 610 * rw: 'W' for write page fault, 'R' for read page fault 611 611 * rescheduled: 'R' if the queue restore failed and rescheduled to try again 612 + * error_code: migrate failure error code, 0 if no error 612 613 */ 613 614 #define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\ 614 615 "%x %s\n", (reset_seq_num), (reset_cause) ··· 631 630 "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\ 632 631 (from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger) 633 632 634 - #define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger)\ 635 - "%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\ 636 - (from), (to), (migrate_trigger) 633 + #define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger, error_code) \ 634 + "%lld -%d @%lx(%lx) %x->%x %d %d\n", (ns), (pid), (start), (size),\ 635 + (from), (to), (migrate_trigger), (error_code) 637 636 638 637 #define KFD_EVENT_FMT_QUEUE_EVICTION(ns, pid, node, evict_trigger)\ 639 638 "%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger)