Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdkfd: Remove arbitrary timeout for hmm_range_fault

On systems with khugepaged enabled and use cases with THP buffers,
hmm_range_fault may take > 15 seconds to return -EBUSY. The arbitrary
timeout value is not accurate and causes memory allocation failures.

Remove the arbitrary timeout value and return EAGAIN to the application if
hmm_range_fault returns EBUSY; userspace libdrm and Thunk will then call
the ioctl again.

Log the EAGAIN case as a debug message, as it is not an error.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Philip Yang and committed by
Alex Deucher
9095e554 10f624ef

+8 -14
+4 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
··· 1088 1088 1089 1089 ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range); 1090 1090 if (ret) { 1091 - pr_err("%s: Failed to get user pages: %d\n", __func__, ret); 1091 + if (ret == -EAGAIN) 1092 + pr_debug("Failed to get user pages, try again\n"); 1093 + else 1094 + pr_err("%s: Failed to get user pages: %d\n", __func__, ret); 1092 1095 goto unregister_out; 1093 1096 } 1094 1097
+3 -9
drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
··· 202 202 pr_debug("hmm range: start = 0x%lx, end = 0x%lx", 203 203 hmm_range->start, hmm_range->end); 204 204 205 - /* Assuming 64MB takes maximum 1 second to fault page address */ 206 - timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL); 207 - timeout *= HMM_RANGE_DEFAULT_TIMEOUT; 208 - timeout = jiffies + msecs_to_jiffies(timeout); 205 + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); 209 206 210 207 retry: 211 208 hmm_range->notifier_seq = mmu_interval_read_begin(notifier); 212 209 r = hmm_range_fault(hmm_range); 213 210 if (unlikely(r)) { 214 - schedule(); 215 - /* 216 - * FIXME: This timeout should encompass the retry from 217 - * mmu_interval_read_retry() as well. 218 - */ 219 211 if (r == -EBUSY && !time_after(jiffies, timeout)) 220 212 goto retry; 221 213 goto out_free_pfns; ··· 239 247 out_free_range: 240 248 kfree(hmm_range); 241 249 250 + if (r == -EBUSY) 251 + r = -EAGAIN; 242 252 return r; 243 253 } 244 254
+1 -4
drivers/gpu/drm/amd/amdkfd/kfd_svm.c
··· 1690 1690 readonly, owner, NULL, 1691 1691 &hmm_range); 1692 1692 WRITE_ONCE(p->svms.faulting_task, NULL); 1693 - if (r) { 1693 + if (r) 1694 1694 pr_debug("failed %d to get svm range pages\n", r); 1695 - if (r == -EBUSY) 1696 - r = -EAGAIN; 1697 - } 1698 1695 } else { 1699 1696 r = -EFAULT; 1700 1697 }