Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: revert "Add autodump debugfs node for gpu reset v8"

This reverts commit 728e7e0cd61899208e924472b9e641dbeb0775c4.

Further discussion reveals that this feature is severely broken
and needs to be reverted ASAP.

GPU reset can never be delayed by userspace, even for debugging;
otherwise we can run into in-kernel deadlocks.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Nirmoy Das <nirmoy.das@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Christian König and committed by
Alex Deucher
c8365dbd 286826d7

-91
-2
drivers/gpu/drm/amd/amdgpu/amdgpu.h
··· 1085 1085 char product_name[32]; 1086 1086 char serial[20]; 1087 1087 1088 - struct amdgpu_autodump autodump; 1089 - 1090 1088 atomic_t throttling_logging_enabled; 1091 1089 struct ratelimit_state throttling_logging_rs; 1092 1090 uint32_t ras_hw_enabled;
-80
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
··· 27 27 #include <linux/pci.h> 28 28 #include <linux/uaccess.h> 29 29 #include <linux/pm_runtime.h> 30 - #include <linux/poll.h> 31 30 32 31 #include "amdgpu.h" 33 32 #include "amdgpu_pm.h" ··· 37 38 #include "amdgpu_fw_attestation.h" 38 39 #include "amdgpu_umr.h" 39 40 40 - int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) 41 - { 42 41 #if defined(CONFIG_DEBUG_FS) 43 - unsigned long timeout = 600 * HZ; 44 - int ret; 45 - 46 - wake_up_interruptible(&adev->autodump.gpu_hang); 47 - 48 - ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout); 49 - if (ret == 0) { 50 - pr_err("autodump: timeout, move on to gpu recovery\n"); 51 - return -ETIMEDOUT; 52 - } 53 - #endif 54 - return 0; 55 - } 56 - 57 - #if defined(CONFIG_DEBUG_FS) 58 - 59 - static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file) 60 - { 61 - struct amdgpu_device *adev = inode->i_private; 62 - int ret; 63 - 64 - file->private_data = adev; 65 - 66 - ret = down_read_killable(&adev->reset_sem); 67 - if (ret) 68 - return ret; 69 - 70 - if (adev->autodump.dumping.done) { 71 - reinit_completion(&adev->autodump.dumping); 72 - ret = 0; 73 - } else { 74 - ret = -EBUSY; 75 - } 76 - 77 - up_read(&adev->reset_sem); 78 - 79 - return ret; 80 - } 81 - 82 - static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file) 83 - { 84 - struct amdgpu_device *adev = file->private_data; 85 - 86 - complete_all(&adev->autodump.dumping); 87 - return 0; 88 - } 89 - 90 - static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table) 91 - { 92 - struct amdgpu_device *adev = file->private_data; 93 - 94 - poll_wait(file, &adev->autodump.gpu_hang, poll_table); 95 - 96 - if (amdgpu_in_reset(adev)) 97 - return POLLIN | POLLRDNORM | POLLWRNORM; 98 - 99 - return 0; 100 - } 101 - 102 - static const struct file_operations autodump_debug_fops = { 103 - .owner = THIS_MODULE, 104 - .open = amdgpu_debugfs_autodump_open, 105 - 
.poll = amdgpu_debugfs_autodump_poll, 106 - .release = amdgpu_debugfs_autodump_release, 107 - }; 108 - 109 - static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev) 110 - { 111 - init_completion(&adev->autodump.dumping); 112 - complete_all(&adev->autodump.dumping); 113 - init_waitqueue_head(&adev->autodump.gpu_hang); 114 - 115 - debugfs_create_file("amdgpu_autodump", 0600, 116 - adev_to_drm(adev)->primary->debugfs_root, 117 - adev, &autodump_debug_fops); 118 - } 119 42 120 43 /** 121 44 * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes ··· 1659 1738 } 1660 1739 1661 1740 amdgpu_ras_debugfs_create_all(adev); 1662 - amdgpu_debugfs_autodump_init(adev); 1663 1741 amdgpu_rap_debugfs_init(adev); 1664 1742 amdgpu_securedisplay_debugfs_init(adev); 1665 1743 amdgpu_fw_attestation_debugfs_init(adev);
-5
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
··· 25 25 /* 26 26 * Debugfs 27 27 */ 28 - struct amdgpu_autodump { 29 - struct completion dumping; 30 - struct wait_queue_head gpu_hang; 31 - }; 32 28 33 29 int amdgpu_debugfs_regs_init(struct amdgpu_device *adev); 34 30 int amdgpu_debugfs_init(struct amdgpu_device *adev); ··· 32 36 void amdgpu_debugfs_fence_init(struct amdgpu_device *adev); 33 37 void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev); 34 38 void amdgpu_debugfs_gem_init(struct amdgpu_device *adev); 35 - int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
-4
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 4440 4440 if (reset_context->reset_req_dev == adev) 4441 4441 job = reset_context->job; 4442 4442 4443 - /* no need to dump if device is not in good state during probe period */ 4444 - if (!adev->gmc.xgmi.pending_reset) 4445 - amdgpu_debugfs_wait_dump(adev); 4446 - 4447 4443 if (amdgpu_sriov_vf(adev)) { 4448 4444 /* stop the data exchange thread */ 4449 4445 amdgpu_virt_fini_data_exchange(adev);