Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: load balance VCN3 decode as well v8

Add VCN3 IB parsing to figure out to which instance we can send the
stream for decode.

v2: remove VCN instance limit as well, fix amdgpu_cs_find_mapping,
check supported formats instead of unsupported.
v3: fix typo and error handling
v4: make sure the message BO is CPU accessible
v5: fix addr calculation once more
v6: only check message buffers
v7: fix constant and use defines
v8: fix create msg calculation

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Sonny Jiang <sonny.jiang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Christian König and committed by Alex Deucher.
87cc7f9e c62dfdbb

+130 -2
+130 -2
drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
··· 50 50 #define VCN_INSTANCES_SIENNA_CICHLID 2 51 51 #define DEC_SW_RING_ENABLED FALSE 52 52 53 + #define RDECODE_MSG_CREATE 0x00000000 54 + #define RDECODE_MESSAGE_CREATE 0x00000001 55 + 53 56 static int amdgpu_ih_clientid_vcns[] = { 54 57 SOC15_IH_CLIENTID_VCN, 55 58 SOC15_IH_CLIENTID_VCN1 ··· 211 208 } else { 212 209 ring->doorbell_index = (adev->doorbell_index.vcn.vcn_ring0_1 << 1) + 8 * i; 213 210 } 214 - if (adev->asic_type == CHIP_SIENNA_CICHLID && i != 0) 215 - ring->no_scheduler = true; 216 211 sprintf(ring->name, "vcn_dec_%d", i); 217 212 r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[i].irq, 0, 218 213 AMDGPU_RING_PRIO_DEFAULT, ··· 1848 1847 .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, 1849 1848 }; 1850 1849 1850 + static int vcn_v3_0_limit_sched(struct amdgpu_cs_parser *p) 1851 + { 1852 + struct drm_gpu_scheduler **scheds; 1853 + 1854 + /* The create msg must be in the first IB submitted */ 1855 + if (atomic_read(&p->entity->fence_seq)) 1856 + return -EINVAL; 1857 + 1858 + scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_DEC] 1859 + [AMDGPU_RING_PRIO_DEFAULT].sched; 1860 + drm_sched_entity_modify_sched(p->entity, scheds, 1); 1861 + return 0; 1862 + } 1863 + 1864 + static int vcn_v3_0_dec_msg(struct amdgpu_cs_parser *p, uint64_t addr) 1865 + { 1866 + struct ttm_operation_ctx ctx = { false, false }; 1867 + struct amdgpu_bo_va_mapping *map; 1868 + uint32_t *msg, num_buffers; 1869 + struct amdgpu_bo *bo; 1870 + uint64_t start, end; 1871 + unsigned int i; 1872 + void * ptr; 1873 + int r; 1874 + 1875 + addr &= AMDGPU_GMC_HOLE_MASK; 1876 + r = amdgpu_cs_find_mapping(p, addr, &bo, &map); 1877 + if (r) { 1878 + DRM_ERROR("Can't find BO for addr 0x%08Lx\n", addr); 1879 + return r; 1880 + } 1881 + 1882 + start = map->start * AMDGPU_GPU_PAGE_SIZE; 1883 + end = (map->last + 1) * AMDGPU_GPU_PAGE_SIZE; 1884 + if (addr & 0x7) { 1885 + DRM_ERROR("VCN messages must be 8 byte aligned!\n"); 1886 + return -EINVAL; 1887 + } 1888 + 1889 + 
bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; 1890 + amdgpu_bo_placement_from_domain(bo, bo->allowed_domains); 1891 + r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 1892 + if (r) { 1893 + DRM_ERROR("Failed validating the VCN message BO (%d)!\n", r); 1894 + return r; 1895 + } 1896 + 1897 + r = amdgpu_bo_kmap(bo, &ptr); 1898 + if (r) { 1899 + DRM_ERROR("Failed mapping the VCN message (%d)!\n", r); 1900 + return r; 1901 + } 1902 + 1903 + msg = ptr + addr - start; 1904 + 1905 + /* Check length */ 1906 + if (msg[1] > end - addr) { 1907 + r = -EINVAL; 1908 + goto out; 1909 + } 1910 + 1911 + if (msg[3] != RDECODE_MSG_CREATE) 1912 + goto out; 1913 + 1914 + num_buffers = msg[2]; 1915 + for (i = 0, msg = &msg[6]; i < num_buffers; ++i, msg += 4) { 1916 + uint32_t offset, size, *create; 1917 + 1918 + if (msg[0] != RDECODE_MESSAGE_CREATE) 1919 + continue; 1920 + 1921 + offset = msg[1]; 1922 + size = msg[2]; 1923 + 1924 + if (offset + size > end) { 1925 + r = -EINVAL; 1926 + goto out; 1927 + } 1928 + 1929 + create = ptr + addr + offset - start; 1930 + 1931 + /* H246, HEVC and VP9 can run on any instance */ 1932 + if (create[0] == 0x7 || create[0] == 0x10 || create[0] == 0x11) 1933 + continue; 1934 + 1935 + r = vcn_v3_0_limit_sched(p); 1936 + if (r) 1937 + goto out; 1938 + } 1939 + 1940 + out: 1941 + amdgpu_bo_kunmap(bo); 1942 + return r; 1943 + } 1944 + 1945 + static int vcn_v3_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p, 1946 + uint32_t ib_idx) 1947 + { 1948 + struct amdgpu_ring *ring = to_amdgpu_ring(p->entity->rq->sched); 1949 + struct amdgpu_ib *ib = &p->job->ibs[ib_idx]; 1950 + uint32_t msg_lo = 0, msg_hi = 0; 1951 + unsigned i; 1952 + int r; 1953 + 1954 + /* The first instance can decode anything */ 1955 + if (!ring->me) 1956 + return 0; 1957 + 1958 + for (i = 0; i < ib->length_dw; i += 2) { 1959 + uint32_t reg = amdgpu_get_ib_value(p, ib_idx, i); 1960 + uint32_t val = amdgpu_get_ib_value(p, ib_idx, i + 1); 1961 + 1962 + if (reg == 
PACKET0(p->adev->vcn.internal.data0, 0)) { 1963 + msg_lo = val; 1964 + } else if (reg == PACKET0(p->adev->vcn.internal.data1, 0)) { 1965 + msg_hi = val; 1966 + } else if (reg == PACKET0(p->adev->vcn.internal.cmd, 0) && 1967 + val == 0) { 1968 + r = vcn_v3_0_dec_msg(p, ((u64)msg_hi) << 32 | msg_lo); 1969 + if (r) 1970 + return r; 1971 + } 1972 + } 1973 + return 0; 1974 + } 1975 + 1851 1976 static const struct amdgpu_ring_funcs vcn_v3_0_dec_ring_vm_funcs = { 1852 1977 .type = AMDGPU_RING_TYPE_VCN_DEC, 1853 1978 .align_mask = 0xf, ··· 1981 1854 .get_rptr = vcn_v3_0_dec_ring_get_rptr, 1982 1855 .get_wptr = vcn_v3_0_dec_ring_get_wptr, 1983 1856 .set_wptr = vcn_v3_0_dec_ring_set_wptr, 1857 + .patch_cs_in_place = vcn_v3_0_ring_patch_cs_in_place, 1984 1858 .emit_frame_size = 1985 1859 SOC15_FLUSH_GPU_TLB_NUM_WREG * 6 + 1986 1860 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 8 +