Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: split nbio callbacks into ras and non-ras ones

NBIO RAS is not managed by the GPU driver when the GPU
is connected to the CPU through XGMI. Split the NBIO
callbacks into RAS and non-RAS ones so that the GPU driver
initializes the NBIO RAS callbacks only when it manages
NBIO RAS.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: John Clements <John.Clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Hawking Zhang and committed by Alex Deucher
6e36f231 87da0cc1

+63 -30
+6 -6
drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
··· 199 199 * ack the interrupt if it is there 200 200 */ 201 201 if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) { 202 - if (adev->nbio.funcs && 203 - adev->nbio.funcs->handle_ras_controller_intr_no_bifring) 204 - adev->nbio.funcs->handle_ras_controller_intr_no_bifring(adev); 202 + if (adev->nbio.ras_funcs && 203 + adev->nbio.ras_funcs->handle_ras_controller_intr_no_bifring) 204 + adev->nbio.ras_funcs->handle_ras_controller_intr_no_bifring(adev); 205 205 206 - if (adev->nbio.funcs && 207 - adev->nbio.funcs->handle_ras_err_event_athub_intr_no_bifring) 208 - adev->nbio.funcs->handle_ras_err_event_athub_intr_no_bifring(adev); 206 + if (adev->nbio.ras_funcs && 207 + adev->nbio.ras_funcs->handle_ras_err_event_athub_intr_no_bifring) 208 + adev->nbio.ras_funcs->handle_ras_err_event_athub_intr_no_bifring(adev); 209 209 } 210 210 211 211 return ret;
+12 -7
drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
··· 47 47 u32 ref_and_mask_sdma7; 48 48 }; 49 49 50 + struct amdgpu_nbio_ras_funcs { 51 + void (*handle_ras_controller_intr_no_bifring)(struct amdgpu_device *adev); 52 + void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device *adev); 53 + int (*init_ras_controller_interrupt)(struct amdgpu_device *adev); 54 + int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev); 55 + void (*query_ras_error_count)(struct amdgpu_device *adev, 56 + void *ras_error_status); 57 + int (*ras_late_init)(struct amdgpu_device *adev); 58 + void (*ras_fini)(struct amdgpu_device *adev); 59 + }; 60 + 50 61 struct amdgpu_nbio_funcs { 51 62 const struct nbio_hdp_flush_reg *hdp_flush_reg; 52 63 u32 (*get_hdp_flush_req_offset)(struct amdgpu_device *adev); ··· 90 79 void (*ih_control)(struct amdgpu_device *adev); 91 80 void (*init_registers)(struct amdgpu_device *adev); 92 81 void (*remap_hdp_registers)(struct amdgpu_device *adev); 93 - void (*handle_ras_controller_intr_no_bifring)(struct amdgpu_device *adev); 94 - void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device *adev); 95 - int (*init_ras_controller_interrupt)(struct amdgpu_device *adev); 96 - int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev); 97 - void (*query_ras_error_count)(struct amdgpu_device *adev, 98 - void *ras_error_status); 99 - int (*ras_late_init)(struct amdgpu_device *adev); 100 82 void (*enable_aspm)(struct amdgpu_device *adev, 101 83 bool enable); 102 84 void (*program_aspm)(struct amdgpu_device *adev); ··· 101 97 struct amdgpu_irq_src ras_err_event_athub_irq; 102 98 struct ras_common_if *ras_if; 103 99 const struct amdgpu_nbio_funcs *funcs; 100 + const struct amdgpu_nbio_ras_funcs *ras_funcs; 104 101 }; 105 102 106 103 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev);
+24 -6
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 804 804 adev->mmhub.funcs->query_ras_error_status(adev); 805 805 break; 806 806 case AMDGPU_RAS_BLOCK__PCIE_BIF: 807 - if (adev->nbio.funcs->query_ras_error_count) 808 - adev->nbio.funcs->query_ras_error_count(adev, &err_data); 807 + if (adev->nbio.ras_funcs && 808 + adev->nbio.ras_funcs->query_ras_error_count) 809 + adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data); 809 810 break; 810 811 case AMDGPU_RAS_BLOCK__XGMI_WAFL: 811 812 amdgpu_xgmi_query_ras_error_count(adev, &err_data); ··· 2031 2030 /* Might need get this flag from vbios. */ 2032 2031 con->flags = RAS_DEFAULT_FLAGS; 2033 2032 2034 - if (adev->nbio.funcs->init_ras_controller_interrupt) { 2035 - r = adev->nbio.funcs->init_ras_controller_interrupt(adev); 2033 + /* initialize nbio ras function ahead of any other 2034 + * ras functions so hardware fatal error interrupt 2035 + * can be enabled as early as possible */ 2036 + switch (adev->asic_type) { 2037 + case CHIP_VEGA20: 2038 + case CHIP_ARCTURUS: 2039 + case CHIP_ALDEBARAN: 2040 + if (!adev->gmc.xgmi.connected_to_cpu) 2041 + adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs; 2042 + break; 2043 + default: 2044 + /* nbio ras is not available */ 2045 + break; 2046 + } 2047 + 2048 + if (adev->nbio.ras_funcs && 2049 + adev->nbio.ras_funcs->init_ras_controller_interrupt) { 2050 + r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev); 2036 2051 if (r) 2037 2052 goto release_con; 2038 2053 } 2039 2054 2040 - if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) { 2041 - r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev); 2055 + if (adev->nbio.ras_funcs && 2056 + adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) { 2057 + r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev); 2042 2058 if (r) 2043 2059 goto release_con; 2044 2060 }
+10 -6
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
··· 557 557 DOORBELL_INTERRUPT_DISABLE, enable ? 0 : 1); 558 558 } 559 559 560 + const struct amdgpu_nbio_ras_funcs nbio_v7_4_ras_funcs = { 561 + .handle_ras_controller_intr_no_bifring = nbio_v7_4_handle_ras_controller_intr_no_bifring, 562 + .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring, 563 + .init_ras_controller_interrupt = nbio_v7_4_init_ras_controller_interrupt, 564 + .init_ras_err_event_athub_interrupt = nbio_v7_4_init_ras_err_event_athub_interrupt, 565 + .query_ras_error_count = nbio_v7_4_query_ras_error_count, 566 + .ras_late_init = amdgpu_nbio_ras_late_init, 567 + .ras_fini = amdgpu_nbio_ras_fini, 568 + }; 569 + 560 570 const struct amdgpu_nbio_funcs nbio_v7_4_funcs = { 561 571 .get_hdp_flush_req_offset = nbio_v7_4_get_hdp_flush_req_offset, 562 572 .get_hdp_flush_done_offset = nbio_v7_4_get_hdp_flush_done_offset, ··· 587 577 .ih_control = nbio_v7_4_ih_control, 588 578 .init_registers = nbio_v7_4_init_registers, 589 579 .remap_hdp_registers = nbio_v7_4_remap_hdp_registers, 590 - .handle_ras_controller_intr_no_bifring = nbio_v7_4_handle_ras_controller_intr_no_bifring, 591 - .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring, 592 - .init_ras_controller_interrupt = nbio_v7_4_init_ras_controller_interrupt, 593 - .init_ras_err_event_athub_interrupt = nbio_v7_4_init_ras_err_event_athub_interrupt, 594 - .query_ras_error_count = nbio_v7_4_query_ras_error_count, 595 - .ras_late_init = amdgpu_nbio_ras_late_init, 596 580 };
+1
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h
··· 28 28 29 29 extern const struct nbio_hdp_flush_reg nbio_v7_4_hdp_flush_reg; 30 30 extern const struct amdgpu_nbio_funcs nbio_v7_4_funcs; 31 + extern const struct amdgpu_nbio_ras_funcs nbio_v7_4_ras_funcs; 31 32 32 33 #endif
+10 -5
drivers/gpu/drm/amd/amdgpu/soc15.c
··· 1523 1523 if (adev->hdp.funcs->reset_ras_error_count) 1524 1524 adev->hdp.funcs->reset_ras_error_count(adev); 1525 1525 1526 - if (adev->nbio.funcs->ras_late_init) 1527 - r = adev->nbio.funcs->ras_late_init(adev); 1526 + if (adev->nbio.ras_funcs && 1527 + adev->nbio.ras_funcs->ras_late_init) 1528 + r = adev->nbio.ras_funcs->ras_late_init(adev); 1528 1529 1529 1530 return r; 1530 1531 } ··· 1546 1545 { 1547 1546 struct amdgpu_device *adev = (struct amdgpu_device *)handle; 1548 1547 1549 - amdgpu_nbio_ras_fini(adev); 1548 + if (adev->nbio.ras_funcs && 1549 + adev->nbio.ras_funcs->ras_fini) 1550 + adev->nbio.ras_funcs->ras_fini(adev); 1550 1551 adev->df.funcs->sw_fini(adev); 1551 1552 return 0; 1552 1553 } ··· 1612 1609 1613 1610 if (adev->nbio.ras_if && 1614 1611 amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) { 1615 - if (adev->nbio.funcs->init_ras_controller_interrupt) 1612 + if (adev->nbio.ras_funcs && 1613 + adev->nbio.ras_funcs->init_ras_controller_interrupt) 1616 1614 amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0); 1617 - if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) 1615 + if (adev->nbio.ras_funcs && 1616 + adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) 1618 1617 amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0); 1619 1618 } 1620 1619