Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe: fix pvc unload issue

Currently, unloading the driver on PVC triggers a NULL pointer
dereference; the call stack is shown below.

[ 4850.618000] Call Trace:
[ 4850.620740] <TASK>
[ 4850.623134] ttm_bo_cleanup_memtype_use+0x3f/0x50 [ttm]
[ 4850.628661] ttm_bo_release+0x154/0x2c0 [ttm]
[ 4850.633317] ? drm_buddy_fini+0x62/0x80 [drm_buddy]
[ 4850.638487] ? __kmem_cache_free+0x27d/0x2c0
[ 4850.643054] ttm_bo_put+0x38/0x60 [ttm]
[ 4850.647190] xe_gem_object_free+0x1f/0x30 [xe]
[ 4850.651945] drm_gem_object_free+0x1e/0x30 [drm]
[ 4850.656904] ggtt_fini_noalloc+0x9d/0xe0 [xe]
[ 4850.661574] drm_managed_release+0xb5/0x150 [drm]
[ 4850.666617] drm_dev_release+0x30/0x50 [drm]
[ 4850.671209] devm_drm_dev_init_release+0x3c/0x60 [drm]

There are a couple of issues, but the main one is that TTM has only
one TTM_PL_TT region, while PVC has 2 tiles and tries to set up one
TTM_PL_TT manager per tile, so the second one overwrites the first.

At unload time, the first tile resets the TTM_PL_TT manager, and when
the second tile then tries to free a BO it hits the NULL dereference,
because the TTM manager has already been reset to NULL.

The fix is to use one global TTM_PL_TT manager.

v2: make gtt mgr global and change the name to sys_mgr

Cc: Stuart Summers <stuart.summers@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Vivi, Rodrigo <rodrigo.vivi@intel.com>
Signed-off-by: Bruce Chang <yu.bruce.chang@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

authored by

Chang, Bruce and committed by
Rodrigo Vivi
1a545ed7 96578d10

+47 -97
+1 -1
drivers/gpu/drm/xe/Makefile
··· 88 88 xe_step.o \ 89 89 xe_sync.o \ 90 90 xe_trace.o \ 91 - xe_ttm_gtt_mgr.o \ 91 + xe_ttm_sys_mgr.o \ 92 92 xe_ttm_stolen_mgr.o \ 93 93 xe_ttm_vram_mgr.o \ 94 94 xe_tuning.o \
+3
drivers/gpu/drm/xe/xe_device.c
··· 27 27 #include "xe_pm.h" 28 28 #include "xe_query.h" 29 29 #include "xe_ttm_stolen_mgr.h" 30 + #include "xe_ttm_sys_mgr.h" 30 31 #include "xe_vm.h" 31 32 #include "xe_vm_madvise.h" 32 33 #include "xe_wait_user_fence.h" ··· 262 261 err = xe_mmio_probe_vram(xe); 263 262 if (err) 264 263 goto err_irq_shutdown; 264 + 265 + xe_ttm_sys_mgr_init(xe); 265 266 266 267 for_each_gt(gt, xe, id) { 267 268 err = xe_gt_init_noalloc(gt);
+1
drivers/gpu/drm/xe/xe_device.h
··· 116 116 } 117 117 118 118 u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size); 119 + 119 120 #endif
+2
drivers/gpu/drm/xe/xe_device_types.h
··· 134 134 /** @mapping: pointer to VRAM mappable space */ 135 135 void *__iomem mapping; 136 136 } vram; 137 + /** @sys_mgr: system TTM manager */ 138 + struct ttm_resource_manager sys_mgr; 137 139 } mem; 138 140 139 141 /** @usm: unified memory state */
-18
drivers/gpu/drm/xe/xe_gt.c
··· 36 36 #include "xe_ring_ops.h" 37 37 #include "xe_sa.h" 38 38 #include "xe_sched_job.h" 39 - #include "xe_ttm_gtt_mgr.h" 40 39 #include "xe_ttm_vram_mgr.h" 41 40 #include "xe_tuning.h" 42 41 #include "xe_uc.h" ··· 76 77 if (!gt->mem.vram_mgr) 77 78 return -ENOMEM; 78 79 79 - gt->mem.gtt_mgr = drmm_kzalloc(drm, sizeof(*gt->mem.gtt_mgr), 80 - GFP_KERNEL); 81 - if (!gt->mem.gtt_mgr) 82 - return -ENOMEM; 83 80 } else { 84 81 struct xe_gt *full_gt = xe_find_full_gt(gt); 85 82 86 83 gt->mem.ggtt = full_gt->mem.ggtt; 87 84 gt->mem.vram_mgr = full_gt->mem.vram_mgr; 88 - gt->mem.gtt_mgr = full_gt->mem.gtt_mgr; 89 85 } 90 86 91 87 gt->ordered_wq = alloc_ordered_workqueue("gt-ordered-wq", 0); ··· 92 98 { 93 99 struct xe_device *xe = gt_to_xe(gt); 94 100 int err; 95 - struct sysinfo si; 96 - u64 gtt_size; 97 - 98 - si_meminfo(&si); 99 - gtt_size = (u64)si.totalram * si.mem_unit * 3/4; 100 101 101 102 if (gt->mem.vram.size) { 102 103 err = xe_ttm_vram_mgr_init(gt, gt->mem.vram_mgr); 103 104 if (err) 104 105 return err; 105 - gtt_size = min(max((XE_DEFAULT_GTT_SIZE_MB << 20), 106 - (u64)gt->mem.vram.size), 107 - gtt_size); 108 106 xe->info.mem_region_mask |= BIT(gt->info.vram_id) << 1; 109 107 } 110 - 111 - err = xe_ttm_gtt_mgr_init(gt, gt->mem.gtt_mgr, gtt_size); 112 - if (err) 113 - return err; 114 108 115 109 return 0; 116 110 }
-2
drivers/gpu/drm/xe/xe_gt_types.h
··· 162 162 } vram; 163 163 /** @vram_mgr: VRAM TTM manager */ 164 164 struct xe_ttm_vram_mgr *vram_mgr; 165 - /** @gtt_mr: GTT TTM manager */ 166 - struct xe_ttm_gtt_mgr *gtt_mgr; 167 165 /** @ggtt: Global graphics translation table */ 168 166 struct xe_ggtt *ggtt; 169 167 } mem;
+27 -42
drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c drivers/gpu/drm/xe/xe_ttm_sys_mgr.c
··· 4 4 * Copyright (C) 2021-2002 Red Hat 5 5 */ 6 6 7 + #include "xe_ttm_sys_mgr.h" 8 + 7 9 #include <drm/drm_managed.h> 8 10 9 11 #include <drm/ttm/ttm_placement.h> ··· 14 12 15 13 #include "xe_bo.h" 16 14 #include "xe_gt.h" 17 - #include "xe_ttm_gtt_mgr.h" 18 15 19 - struct xe_ttm_gtt_node { 16 + struct xe_ttm_sys_node { 20 17 struct ttm_buffer_object *tbo; 21 18 struct ttm_range_mgr_node base; 22 19 }; 23 20 24 - static inline struct xe_ttm_gtt_mgr * 25 - to_gtt_mgr(struct ttm_resource_manager *man) 21 + static inline struct xe_ttm_sys_node * 22 + to_xe_ttm_sys_node(struct ttm_resource *res) 26 23 { 27 - return container_of(man, struct xe_ttm_gtt_mgr, manager); 24 + return container_of(res, struct xe_ttm_sys_node, base.base); 28 25 } 29 26 30 - static inline struct xe_ttm_gtt_node * 31 - to_xe_ttm_gtt_node(struct ttm_resource *res) 32 - { 33 - return container_of(res, struct xe_ttm_gtt_node, base.base); 34 - } 35 - 36 - static int xe_ttm_gtt_mgr_new(struct ttm_resource_manager *man, 27 + static int xe_ttm_sys_mgr_new(struct ttm_resource_manager *man, 37 28 struct ttm_buffer_object *tbo, 38 29 const struct ttm_place *place, 39 30 struct ttm_resource **res) 40 31 { 41 - struct xe_ttm_gtt_node *node; 32 + struct xe_ttm_sys_node *node; 42 33 int r; 43 34 44 35 node = kzalloc(struct_size(node, base.mm_nodes, 1), GFP_KERNEL); ··· 61 66 return r; 62 67 } 63 68 64 - static void xe_ttm_gtt_mgr_del(struct ttm_resource_manager *man, 69 + static void xe_ttm_sys_mgr_del(struct ttm_resource_manager *man, 65 70 struct ttm_resource *res) 66 71 { 67 - struct xe_ttm_gtt_node *node = to_xe_ttm_gtt_node(res); 72 + struct xe_ttm_sys_node *node = to_xe_ttm_sys_node(res); 68 73 69 74 ttm_resource_fini(man, res); 70 75 kfree(node); 71 76 } 72 77 73 - static void xe_ttm_gtt_mgr_debug(struct ttm_resource_manager *man, 78 + static void xe_ttm_sys_mgr_debug(struct ttm_resource_manager *man, 74 79 struct drm_printer *printer) 75 80 { 76 81 77 82 } 78 83 79 - static const struct 
ttm_resource_manager_func xe_ttm_gtt_mgr_func = { 80 - .alloc = xe_ttm_gtt_mgr_new, 81 - .free = xe_ttm_gtt_mgr_del, 82 - .debug = xe_ttm_gtt_mgr_debug 84 + static const struct ttm_resource_manager_func xe_ttm_sys_mgr_func = { 85 + .alloc = xe_ttm_sys_mgr_new, 86 + .free = xe_ttm_sys_mgr_del, 87 + .debug = xe_ttm_sys_mgr_debug 83 88 }; 84 89 85 - static void ttm_gtt_mgr_fini(struct drm_device *drm, void *arg) 90 + static void ttm_sys_mgr_fini(struct drm_device *drm, void *arg) 86 91 { 87 - struct xe_ttm_gtt_mgr *mgr = arg; 88 - struct xe_device *xe = gt_to_xe(mgr->gt); 89 - struct ttm_resource_manager *man = &mgr->manager; 92 + struct xe_device *xe = (struct xe_device *)arg; 93 + struct ttm_resource_manager *man = &xe->mem.sys_mgr; 90 94 int err; 91 95 92 96 ttm_resource_manager_set_used(man, false); ··· 98 104 ttm_set_driver_manager(&xe->ttm, XE_PL_TT, NULL); 99 105 } 100 106 101 - int xe_ttm_gtt_mgr_init(struct xe_gt *gt, struct xe_ttm_gtt_mgr *mgr, 102 - u64 gtt_size) 107 + int xe_ttm_sys_mgr_init(struct xe_device *xe) 103 108 { 104 - struct xe_device *xe = gt_to_xe(gt); 105 - struct ttm_resource_manager *man = &mgr->manager; 106 - int err; 109 + struct ttm_resource_manager *man = &xe->mem.sys_mgr; 110 + struct sysinfo si; 111 + u64 gtt_size; 107 112 108 - XE_BUG_ON(xe_gt_is_media_type(gt)); 109 - 110 - mgr->gt = gt; 113 + si_meminfo(&si); 114 + gtt_size = (u64)si.totalram * si.mem_unit * 3/4; 111 115 man->use_tt = true; 112 - man->func = &xe_ttm_gtt_mgr_func; 113 - 116 + man->func = &xe_ttm_sys_mgr_func; 114 117 ttm_resource_manager_init(man, &xe->ttm, gtt_size >> PAGE_SHIFT); 115 - 116 - ttm_set_driver_manager(&xe->ttm, XE_PL_TT, &mgr->manager); 118 + ttm_set_driver_manager(&xe->ttm, XE_PL_TT, man); 117 119 ttm_resource_manager_set_used(man, true); 118 - 119 - err = drmm_add_action_or_reset(&xe->drm, ttm_gtt_mgr_fini, mgr); 120 - if (err) 121 - return err; 122 - 123 - return 0; 120 + return drmm_add_action_or_reset(&xe->drm, ttm_sys_mgr_fini, xe); 124 121 }
-16
drivers/gpu/drm/xe/xe_ttm_gtt_mgr.h
··· 1 - /* SPDX-License-Identifier: MIT */ 2 - /* 3 - * Copyright © 2022 Intel Corporation 4 - */ 5 - 6 - #ifndef _XE_TTGM_GTT_MGR_H_ 7 - #define _XE_TTGM_GTT_MGR_H_ 8 - 9 - #include "xe_ttm_gtt_mgr_types.h" 10 - 11 - struct xe_gt; 12 - 13 - int xe_ttm_gtt_mgr_init(struct xe_gt *gt, struct xe_ttm_gtt_mgr *mgr, 14 - u64 gtt_size); 15 - 16 - #endif
-18
drivers/gpu/drm/xe/xe_ttm_gtt_mgr_types.h
··· 1 - /* SPDX-License-Identifier: MIT */ 2 - /* 3 - * Copyright © 2022 Intel Corporation 4 - */ 5 - 6 - #ifndef _XE_TTM_GTT_MGR_TYPES_H_ 7 - #define _XE_TTM_GTT_MGR_TYPES_H_ 8 - 9 - #include <drm/ttm/ttm_device.h> 10 - 11 - struct xe_gt; 12 - 13 - struct xe_ttm_gtt_mgr { 14 - struct xe_gt *gt; 15 - struct ttm_resource_manager manager; 16 - }; 17 - 18 - #endif
+13
drivers/gpu/drm/xe/xe_ttm_sys_mgr.h
··· 1 + /* SPDX-License-Identifier: MIT */ 2 + /* 3 + * Copyright © 2023 Intel Corporation 4 + */ 5 + 6 + #ifndef _XE_TTM_SYS_MGR_H_ 7 + #define _XE_TTM_SYS_MGR_H_ 8 + 9 + struct xe_device; 10 + 11 + int xe_ttm_sys_mgr_init(struct xe_device *xe); 12 + 13 + #endif