Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Fix crash on device remove/driver unload

Crash:
BUG: unable to handle page fault for address: 00000000000010e1
RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
Call Trace:
pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
amdgpu_pci_remove+0x27/0x40 [amdgpu]
pci_device_remove+0x3e/0xb0
device_release_driver_internal+0x103/0x1d0
device_release_driver+0x12/0x20
pci_stop_bus_device+0x79/0xa0
pci_stop_and_remove_bus_device_locked+0x1b/0x30
remove_store+0x7b/0x90
dev_attr_store+0x17/0x30
sysfs_kf_write+0x4b/0x60
kernfs_fop_write_iter+0x151/0x1e0

Why:
VCE/UVD had a dependency on the SMC block for their suspend, but
the SMC block is the first to do HW fini due to some constraints.

How:
Since the original patch was dealing with suspend issues
move the SMC block dependency back into suspend hooks as
was done in V1 of the original patches.
Keep flushing idle work both in the suspend and HW fini sequences
since it's essential in both cases.

Fixes: 859e4659273f1d ("drm/amdgpu: add missing cleanups for more ASICs on UVD/VCE suspend")
Fixes: bf756fb833cbe8 ("drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend")
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Andrey Grodzovsky and committed by
Alex Deucher
d82e2c24 0a226780

+105 -90
+13 -11
drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
··· 698 698 { 699 699 struct amdgpu_device *adev = (struct amdgpu_device *)handle; 700 700 701 + cancel_delayed_work_sync(&adev->uvd.idle_work); 702 + 703 + if (RREG32(mmUVD_STATUS) != 0) 704 + uvd_v3_1_stop(adev); 705 + 706 + return 0; 707 + } 708 + 709 + static int uvd_v3_1_suspend(void *handle) 710 + { 711 + int r; 712 + struct amdgpu_device *adev = (struct amdgpu_device *)handle; 713 + 701 714 /* 702 715 * Proper cleanups before halting the HW engine: 703 716 * - cancel the delayed idle work ··· 734 721 amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_UVD, 735 722 AMD_CG_STATE_GATE); 736 723 } 737 - 738 - if (RREG32(mmUVD_STATUS) != 0) 739 - uvd_v3_1_stop(adev); 740 - 741 - return 0; 742 - } 743 - 744 - static int uvd_v3_1_suspend(void *handle) 745 - { 746 - int r; 747 - struct amdgpu_device *adev = (struct amdgpu_device *)handle; 748 724 749 725 r = uvd_v3_1_hw_fini(adev); 750 726 if (r)
+13 -11
drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
··· 212 212 { 213 213 struct amdgpu_device *adev = (struct amdgpu_device *)handle; 214 214 215 + cancel_delayed_work_sync(&adev->uvd.idle_work); 216 + 217 + if (RREG32(mmUVD_STATUS) != 0) 218 + uvd_v4_2_stop(adev); 219 + 220 + return 0; 221 + } 222 + 223 + static int uvd_v4_2_suspend(void *handle) 224 + { 225 + int r; 226 + struct amdgpu_device *adev = (struct amdgpu_device *)handle; 227 + 215 228 /* 216 229 * Proper cleanups before halting the HW engine: 217 230 * - cancel the delayed idle work ··· 248 235 amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_UVD, 249 236 AMD_CG_STATE_GATE); 250 237 } 251 - 252 - if (RREG32(mmUVD_STATUS) != 0) 253 - uvd_v4_2_stop(adev); 254 - 255 - return 0; 256 - } 257 - 258 - static int uvd_v4_2_suspend(void *handle) 259 - { 260 - int r; 261 - struct amdgpu_device *adev = (struct amdgpu_device *)handle; 262 238 263 239 r = uvd_v4_2_hw_fini(adev); 264 240 if (r)
+13 -11
drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
··· 210 210 { 211 211 struct amdgpu_device *adev = (struct amdgpu_device *)handle; 212 212 213 + cancel_delayed_work_sync(&adev->uvd.idle_work); 214 + 215 + if (RREG32(mmUVD_STATUS) != 0) 216 + uvd_v5_0_stop(adev); 217 + 218 + return 0; 219 + } 220 + 221 + static int uvd_v5_0_suspend(void *handle) 222 + { 223 + int r; 224 + struct amdgpu_device *adev = (struct amdgpu_device *)handle; 225 + 213 226 /* 214 227 * Proper cleanups before halting the HW engine: 215 228 * - cancel the delayed idle work ··· 246 233 amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_UVD, 247 234 AMD_CG_STATE_GATE); 248 235 } 249 - 250 - if (RREG32(mmUVD_STATUS) != 0) 251 - uvd_v5_0_stop(adev); 252 - 253 - return 0; 254 - } 255 - 256 - static int uvd_v5_0_suspend(void *handle) 257 - { 258 - int r; 259 - struct amdgpu_device *adev = (struct amdgpu_device *)handle; 260 236 261 237 r = uvd_v5_0_hw_fini(adev); 262 238 if (r)
+17 -15
drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
··· 597 597 { 598 598 struct amdgpu_device *adev = (struct amdgpu_device *)handle; 599 599 600 + cancel_delayed_work_sync(&adev->uvd.idle_work); 601 + 602 + if (!amdgpu_sriov_vf(adev)) 603 + uvd_v7_0_stop(adev); 604 + else { 605 + /* full access mode, so don't touch any UVD register */ 606 + DRM_DEBUG("For SRIOV client, shouldn't do anything.\n"); 607 + } 608 + 609 + return 0; 610 + } 611 + 612 + static int uvd_v7_0_suspend(void *handle) 613 + { 614 + int r; 615 + struct amdgpu_device *adev = (struct amdgpu_device *)handle; 616 + 600 617 /* 601 618 * Proper cleanups before halting the HW engine: 602 619 * - cancel the delayed idle work ··· 637 620 amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_UVD, 638 621 AMD_CG_STATE_GATE); 639 622 } 640 - 641 - if (!amdgpu_sriov_vf(adev)) 642 - uvd_v7_0_stop(adev); 643 - else { 644 - /* full access mode, so don't touch any UVD register */ 645 - DRM_DEBUG("For SRIOV client, shouldn't do anything.\n"); 646 - } 647 - 648 - return 0; 649 - } 650 - 651 - static int uvd_v7_0_suspend(void *handle) 652 - { 653 - int r; 654 - struct amdgpu_device *adev = (struct amdgpu_device *)handle; 655 623 656 624 r = uvd_v7_0_hw_fini(adev); 657 625 if (r)
+11 -8
drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
··· 481 481 { 482 482 struct amdgpu_device *adev = (struct amdgpu_device *)handle; 483 483 484 + cancel_delayed_work_sync(&adev->vce.idle_work); 485 + 486 + return 0; 487 + } 488 + 489 + static int vce_v2_0_suspend(void *handle) 490 + { 491 + int r; 492 + struct amdgpu_device *adev = (struct amdgpu_device *)handle; 493 + 494 + 484 495 /* 485 496 * Proper cleanups before halting the HW engine: 486 497 * - cancel the delayed idle work ··· 514 503 amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE, 515 504 AMD_CG_STATE_GATE); 516 505 } 517 - 518 - return 0; 519 - } 520 - 521 - static int vce_v2_0_suspend(void *handle) 522 - { 523 - int r; 524 - struct amdgpu_device *adev = (struct amdgpu_device *)handle; 525 506 526 507 r = vce_v2_0_hw_fini(adev); 527 508 if (r)
+15 -13
drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
··· 492 492 int r; 493 493 struct amdgpu_device *adev = (struct amdgpu_device *)handle; 494 494 495 + cancel_delayed_work_sync(&adev->vce.idle_work); 496 + 497 + r = vce_v3_0_wait_for_idle(handle); 498 + if (r) 499 + return r; 500 + 501 + vce_v3_0_stop(adev); 502 + return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE); 503 + } 504 + 505 + static int vce_v3_0_suspend(void *handle) 506 + { 507 + int r; 508 + struct amdgpu_device *adev = (struct amdgpu_device *)handle; 509 + 495 510 /* 496 511 * Proper cleanups before halting the HW engine: 497 512 * - cancel the delayed idle work ··· 529 514 amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE, 530 515 AMD_CG_STATE_GATE); 531 516 } 532 - 533 - r = vce_v3_0_wait_for_idle(handle); 534 - if (r) 535 - return r; 536 - 537 - vce_v3_0_stop(adev); 538 - return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE); 539 - } 540 - 541 - static int vce_v3_0_suspend(void *handle) 542 - { 543 - int r; 544 - struct amdgpu_device *adev = (struct amdgpu_device *)handle; 545 517 546 518 r = vce_v3_0_hw_fini(adev); 547 519 if (r)
+23 -21
drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
··· 544 544 { 545 545 struct amdgpu_device *adev = (struct amdgpu_device *)handle; 546 546 547 - /* 548 - * Proper cleanups before halting the HW engine: 549 - * - cancel the delayed idle work 550 - * - enable powergating 551 - * - enable clockgating 552 - * - disable dpm 553 - * 554 - * TODO: to align with the VCN implementation, move the 555 - * jobs for clockgating/powergating/dpm setting to 556 - * ->set_powergating_state(). 557 - */ 558 547 cancel_delayed_work_sync(&adev->vce.idle_work); 559 - 560 - if (adev->pm.dpm_enabled) { 561 - amdgpu_dpm_enable_vce(adev, false); 562 - } else { 563 - amdgpu_asic_set_vce_clocks(adev, 0, 0); 564 - amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE, 565 - AMD_PG_STATE_GATE); 566 - amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE, 567 - AMD_CG_STATE_GATE); 568 - } 569 548 570 549 if (!amdgpu_sriov_vf(adev)) { 571 550 /* vce_v4_0_wait_for_idle(handle); */ ··· 573 594 memcpy_fromio(adev->vce.saved_bo, ptr, size); 574 595 } 575 596 drm_dev_exit(idx); 597 + } 598 + 599 + /* 600 + * Proper cleanups before halting the HW engine: 601 + * - cancel the delayed idle work 602 + * - enable powergating 603 + * - enable clockgating 604 + * - disable dpm 605 + * 606 + * TODO: to align with the VCN implementation, move the 607 + * jobs for clockgating/powergating/dpm setting to 608 + * ->set_powergating_state(). 609 + */ 610 + cancel_delayed_work_sync(&adev->vce.idle_work); 611 + 612 + if (adev->pm.dpm_enabled) { 613 + amdgpu_dpm_enable_vce(adev, false); 614 + } else { 615 + amdgpu_asic_set_vce_clocks(adev, 0, 0); 616 + amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE, 617 + AMD_PG_STATE_GATE); 618 + amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE, 619 + AMD_CG_STATE_GATE); 576 620 } 577 621 578 622 r = vce_v4_0_hw_fini(adev);