Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Support nbif v6_3_1 fatal error handling

Add nbif v6_3_1 fatal error handling support.

Signed-off-by: Candice Li <candice.li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Candice Li and committed by
Alex Deucher
ecd1191e 58893392

+111 -2
+12
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 36 36 #include "amdgpu_xgmi.h" 37 37 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" 38 38 #include "nbio_v4_3.h" 39 + #include "nbif_v6_3_1.h" 39 40 #include "nbio_v7_9.h" 40 41 #include "atom.h" 41 42 #include "amdgpu_reset.h" ··· 3911 3910 * enable nbio ras in such case. Instead, 3912 3911 * check DF RAS */ 3913 3912 adev->nbio.ras = &nbio_v4_3_ras; 3913 + break; 3914 + case IP_VERSION(6, 3, 1): 3915 + if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) 3916 + /* unlike other generations of nbio ras, 3917 + * nbif v6_3_1 only supports the fatal error interrupt 3918 + * to inform software that DF is frozen due to a 3919 + * system fatal error event. The driver should not 3920 + * enable nbio ras in such a case. Instead, 3921 + * check DF RAS 3922 + */ 3923 + adev->nbio.ras = &nbif_v6_3_1_ras; 3914 3924 break; 3915 3925 case IP_VERSION(7, 9, 0): 3916 3926 case IP_VERSION(7, 9, 1):
+81
drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c
··· 28 28 #include "nbif/nbif_6_3_1_sh_mask.h" 29 29 #include "pcie/pcie_6_1_0_offset.h" 30 30 #include "pcie/pcie_6_1_0_sh_mask.h" 31 + #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" 31 32 #include <uapi/linux/kfd_ioctl.h> 32 33 33 34 static void nbif_v6_3_1_remap_hdp_registers(struct amdgpu_device *adev) ··· 518 517 .remap_hdp_registers = nbif_v6_3_1_remap_hdp_registers, 519 518 .get_rom_offset = nbif_v6_3_1_get_rom_offset, 520 519 .set_reg_remap = nbif_v6_3_1_set_reg_remap, 520 + }; 521 + 522 + static int nbif_v6_3_1_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev, 523 + struct amdgpu_irq_src *src, 524 + unsigned type, 525 + enum amdgpu_interrupt_state state) 526 + { 527 + /* The ras_controller_irq enablement should be done in psp bl when it 528 + * tries to enable the ras feature. The driver only needs to set the correct interrupt 529 + * vector for the bare-metal and sriov use cases respectively. 530 + */ 531 + uint32_t bif_doorbell_int_cntl; 532 + 533 + /* read-modify-write of BIF_DOORBELL_INT_CNTL: RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE is written as 0 when state is AMDGPU_IRQ_STATE_ENABLE and as 1 otherwise */ bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); 534 + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl, 535 + BIF_BX0_BIF_DOORBELL_INT_CNTL, 536 + RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE, 537 + (state == AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1); 538 + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl); 539 + 540 + return 0; 541 + } 542 + 543 + static int nbif_v6_3_1_process_err_event_athub_irq(struct amdgpu_device *adev, 544 + struct amdgpu_irq_src *source, 545 + struct amdgpu_iv_entry *entry) 546 + { 547 + /* By design, the ih cookie for err_event_athub_irq should be written 548 + * to the bif ring. Since the bif ring is not enabled, just leave the process callback 549 + * as a dummy one that always returns 0.
550 + */ 551 + return 0; 552 + } 553 + 554 + /* irq source callbacks (.set / .process) installed on ras_err_event_athub_irq by the init function below */ static const struct amdgpu_irq_src_funcs nbif_v6_3_1_ras_err_event_athub_irq_funcs = { 555 + .set = nbif_v6_3_1_set_ras_err_event_athub_irq_state, 556 + .process = nbif_v6_3_1_process_err_event_athub_irq, 557 + }; 558 + 559 + /* Reads BIF_DOORBELL_INT_CNTL; if RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS is set, writes the register back with RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR = 1 and then calls amdgpu_ras_global_ras_isr(). Does nothing when the status field is clear. */ static void nbif_v6_3_1_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) 560 + { 561 + uint32_t bif_doorbell_int_cntl; 562 + 563 + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); 564 + if (REG_GET_FIELD(bif_doorbell_int_cntl, 565 + BIF_BX0_BIF_DOORBELL_INT_CNTL, 566 + RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) { 567 + /* the driver has to clear the interrupt status itself when the bif ring is disabled */ 568 + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl, 569 + BIF_BX0_BIF_DOORBELL_INT_CNTL, 570 + RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); 571 + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl); 572 + amdgpu_ras_global_ras_isr(adev); 573 + } 574 + } 575 + 576 + /* Installs the irq funcs table on adev->nbio.ras_err_event_athub_irq and registers it with amdgpu_irq_add_id(); returns that call's result. */ static int nbif_v6_3_1_init_ras_err_event_athub_interrupt(struct amdgpu_device *adev) 577 + { 578 + int r; 579 + 580 + /* init the irq funcs; a single interrupt type is exposed (num_types = 1) */ 581 + adev->nbio.ras_err_event_athub_irq.funcs = 582 + &nbif_v6_3_1_ras_err_event_athub_irq_funcs; 583 + adev->nbio.ras_err_event_athub_irq.num_types = 1; 584 + 585 + /* register the ras err event athub interrupt; 586 + * nbif v6_3_1 uses the same irq source id as nbio v7_4 587 + */ 588 + r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_BIF, 589 + NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT, 590 + &adev->nbio.ras_err_event_athub_irq); 591 + 592 + return r; 593 + } 594 + 595 + /* RAS hooks for nbif v6_3_1; only the two athub error-event callbacks are populated in this initializer */ struct amdgpu_nbio_ras nbif_v6_3_1_ras = { 596 + .handle_ras_err_event_athub_intr_no_bifring = 597 + nbif_v6_3_1_handle_ras_err_event_athub_intr_no_bifring, 598 + .init_ras_err_event_athub_interrupt = 599 + nbif_v6_3_1_init_ras_err_event_athub_interrupt, 521 600 };
+1
drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.h
··· 29 29 extern const struct nbio_hdp_flush_reg nbif_v6_3_1_hdp_flush_reg; 30 30 extern const struct amdgpu_nbio_funcs nbif_v6_3_1_funcs; 31 31 extern const struct amdgpu_nbio_funcs nbif_v6_3_1_sriov_funcs; 32 + extern struct amdgpu_nbio_ras nbif_v6_3_1_ras; 32 33 33 34 #endif
+17 -2
drivers/gpu/drm/amd/amdgpu/soc24.c
··· 444 444 { 445 445 struct amdgpu_device *adev = ip_block->adev; 446 446 447 - if (amdgpu_sriov_vf(adev)) 447 + if (amdgpu_sriov_vf(adev)) { 448 448 xgpu_nv_mailbox_get_irq(adev); 449 + } else { 450 + if (adev->nbio.ras && 451 + adev->nbio.ras_err_event_athub_irq.funcs) 452 + /* don't need to fail gpu late init 453 + * if enabling athub_err_event interrupt failed. 454 + * nbif v6_3_1 only supports fatal error handling, 455 + * so just enable the interrupt directly 456 + */ 457 + amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0); 458 + } 449 459 450 460 /* Enable selfring doorbell aperture late because doorbell BAR 451 461 * aperture will change if resize BAR successfully in gmc sw_init. ··· 511 501 adev->nbio.funcs->enable_doorbell_aperture(adev, false); 512 502 adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false); 513 503 514 - if (amdgpu_sriov_vf(adev)) 504 + if (amdgpu_sriov_vf(adev)) { 515 505 xgpu_nv_mailbox_put_irq(adev); 506 + } else { 507 + if (adev->nbio.ras && 508 + adev->nbio.ras_err_event_athub_irq.funcs) 509 + amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0); 510 + } 516 511 517 512 return 0; 518 513 }