Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdkfd: avoid HMM change cause circular lock

There is a circular lock dependency between the gfx and kfd paths with the HMM change:
lock(dqm) -> bo::reserve -> amdgpu_mn_lock

To avoid this, move init/uninit_mqd() out of lock(dqm), to remove the nested
locking between mmap_sem and bo::reserve. The resulting locking order
is: bo::reserve -> amdgpu_mn_lock(p->mn)

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Philip Yang and committed by
Alex Deucher
89cd9d23 2c5a51f5

+17 -15
+17 -15
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
··· 1161 1161 int retval; 1162 1162 struct mqd_manager *mqd_mgr; 1163 1163 1164 - retval = 0; 1165 - 1166 - dqm_lock(dqm); 1167 - 1168 1164 if (dqm->total_queue_count >= max_num_of_queues_per_device) { 1169 1165 pr_warn("Can't create new usermode queue because %d queues were already created\n", 1170 1166 dqm->total_queue_count); 1171 1167 retval = -EPERM; 1172 - goto out_unlock; 1168 + goto out; 1173 1169 } 1174 1170 1175 1171 if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { 1176 1172 retval = allocate_sdma_queue(dqm, &q->sdma_id); 1177 1173 if (retval) 1178 - goto out_unlock; 1174 + goto out; 1179 1175 q->properties.sdma_queue_id = 1180 1176 q->sdma_id / get_num_sdma_engines(dqm); 1181 1177 q->properties.sdma_engine_id = ··· 1185 1189 if (retval) 1186 1190 goto out_deallocate_sdma_queue; 1187 1191 1192 + /* Do init_mqd before dqm_lock(dqm) to avoid circular locking order: 1193 + * lock(dqm) -> bo::reserve 1194 + */ 1188 1195 mqd_mgr = dqm->ops.get_mqd_manager(dqm, 1189 1196 get_mqd_type_from_queue_type(q->properties.type)); 1190 1197 ··· 1195 1196 retval = -ENOMEM; 1196 1197 goto out_deallocate_doorbell; 1197 1198 } 1199 + 1198 1200 /* 1199 1201 * Eviction state logic: we only mark active queues as evicted 1200 1202 * to avoid the overhead of restoring inactive queues later ··· 1204 1204 q->properties.is_evicted = (q->properties.queue_size > 0 && 1205 1205 q->properties.queue_percent > 0 && 1206 1206 q->properties.queue_address != 0); 1207 - 1208 1207 dqm->asic_ops.init_sdma_vm(dqm, q, qpd); 1209 - 1210 1208 q->properties.tba_addr = qpd->tba_addr; 1211 1209 q->properties.tma_addr = qpd->tma_addr; 1212 1210 retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, 1213 1211 &q->gart_mqd_addr, &q->properties); 1214 1212 if (retval) 1215 1213 goto out_deallocate_doorbell; 1214 + 1215 + dqm_lock(dqm); 1216 1216 1217 1217 list_add(&q->list, &qpd->queues_list); 1218 1218 qpd->queue_count++; ··· 1241 1241 out_deallocate_sdma_queue: 1242 1242 if (q->properties.type == 
KFD_QUEUE_TYPE_SDMA) 1243 1243 deallocate_sdma_queue(dqm, q->sdma_id); 1244 - out_unlock: 1245 - dqm_unlock(dqm); 1246 - 1244 + out: 1247 1245 return retval; 1248 1246 } 1249 1247 ··· 1404 1406 qpd->reset_wavefronts = true; 1405 1407 } 1406 1408 1407 - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); 1408 - 1409 1409 /* 1410 1410 * Unconditionally decrement this counter, regardless of the queue's 1411 1411 * type ··· 1413 1417 dqm->total_queue_count); 1414 1418 1415 1419 dqm_unlock(dqm); 1420 + 1421 + /* Do uninit_mqd after dqm_unlock(dqm) to avoid circular locking */ 1422 + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); 1416 1423 1417 1424 return retval; 1418 1425 ··· 1640 1641 qpd->reset_wavefronts = false; 1641 1642 } 1642 1643 1643 - /* lastly, free mqd resources */ 1644 + dqm_unlock(dqm); 1645 + 1646 + /* Lastly, free mqd resources. 1647 + * Do uninit_mqd() after dqm_unlock to avoid circular locking. 1648 + */ 1644 1649 list_for_each_entry_safe(q, next, &qpd->queues_list, list) { 1645 1650 mqd_mgr = dqm->ops.get_mqd_manager(dqm, 1646 1651 get_mqd_type_from_queue_type(q->properties.type)); ··· 1658 1655 } 1659 1656 1660 1657 out: 1661 - dqm_unlock(dqm); 1662 1658 return retval; 1663 1659 } 1664 1660