Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/vc4: Add T-format scanout support.

The T tiling format is what V3D uses for textures, with no raster
support at all until later revisions of the hardware (and always at a
large 3D performance penalty). If we can't scan out V3D's format,
then we often need to do a relayout at some stage of the pipeline,
either right before texturing from the scanout buffer (common in X11
without a compositor) or from a tiled screen buffer into a linear one
right before scanout (an option I've considered in trying to resolve this
inconsistency, but which means needing to use the dirty fb ioctl and
having some update policy).

T-format scanout lets us avoid either of those shadow copies, for a
massive, obvious performance improvement to X11 window dragging
without a compositor. Unfortunately, enabling a compositor to work
around the discrepancy has turned out to be too costly in memory
consumption for the Raspbian distribution.

Because the HVS operates a scanline at a time, compositing from T does
increase the memory bandwidth cost of scanout. On my 1920x1080@32bpp
display on a RPi3, we go from about 15% of system memory bandwidth
with linear to about 20% with tiled. However, for X11 this still ends
up being a huge performance win in active usage.

This patch doesn't yet handle src_x/src_y offsetting within the tiled
buffer. However, we fail to do so for untiled buffers already.

Signed-off-by: Eric Anholt <eric@anholt.net>
Link: http://patchwork.freedesktop.org/patch/msgid/20170608001336.12842-1-eric@anholt.net
Reviewed-by: Boris Brezillon <boris.brezillon@free-electrons.com>

+68 -5
+27 -4
drivers/gpu/drm/vc4/vc4_plane.c
··· 500 500 u32 ctl0_offset = vc4_state->dlist_count; 501 501 const struct hvs_format *format = vc4_get_hvs_format(fb->format->format); 502 502 int num_planes = drm_format_num_planes(format->drm); 503 - u32 scl0, scl1; 504 - u32 lbm_size; 503 + u32 scl0, scl1, pitch0; 504 + u32 lbm_size, tiling; 505 505 unsigned long irqflags; 506 506 int ret, i; 507 507 ··· 542 542 scl1 = vc4_get_scl_field(state, 0); 543 543 } 544 544 545 + switch (fb->modifier) { 546 + case DRM_FORMAT_MOD_LINEAR: 547 + tiling = SCALER_CTL0_TILING_LINEAR; 548 + pitch0 = VC4_SET_FIELD(fb->pitches[0], SCALER_SRC_PITCH); 549 + break; 550 + case DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED: 551 + tiling = SCALER_CTL0_TILING_256B_OR_T; 552 + 553 + pitch0 = (VC4_SET_FIELD(0, SCALER_PITCH0_TILE_Y_OFFSET) | 554 + VC4_SET_FIELD(0, SCALER_PITCH0_TILE_WIDTH_L) | 555 + VC4_SET_FIELD((vc4_state->src_w[0] + 31) >> 5, 556 + SCALER_PITCH0_TILE_WIDTH_R)); 557 + break; 558 + default: 559 + DRM_DEBUG_KMS("Unsupported FB tiling flag 0x%16llx", 560 + (long long)fb->modifier); 561 + return -EINVAL; 562 + } 563 + 545 564 /* Control word */ 546 565 vc4_dlist_write(vc4_state, 547 566 SCALER_CTL0_VALID | 548 567 (format->pixel_order << SCALER_CTL0_ORDER_SHIFT) | 549 568 (format->hvs << SCALER_CTL0_PIXEL_FORMAT_SHIFT) | 569 + VC4_SET_FIELD(tiling, SCALER_CTL0_TILING) | 550 570 (vc4_state->is_unity ? SCALER_CTL0_UNITY : 0) | 551 571 VC4_SET_FIELD(scl0, SCALER_CTL0_SCL0) | 552 572 VC4_SET_FIELD(scl1, SCALER_CTL0_SCL1)); ··· 620 600 for (i = 0; i < num_planes; i++) 621 601 vc4_dlist_write(vc4_state, 0xc0c0c0c0); 622 602 623 - /* Pitch word 0/1/2 */ 624 - for (i = 0; i < num_planes; i++) { 603 + /* Pitch word 0 */ 604 + vc4_dlist_write(vc4_state, pitch0); 605 + 606 + /* Pitch word 1/2 */ 607 + for (i = 1; i < num_planes; i++) { 625 608 vc4_dlist_write(vc4_state, 626 609 VC4_SET_FIELD(fb->pitches[i], SCALER_SRC_PITCH)); 627 610 }
+19
drivers/gpu/drm/vc4/vc4_regs.h
··· 709 709 #define SCALER_CTL0_SIZE_MASK VC4_MASK(29, 24) 710 710 #define SCALER_CTL0_SIZE_SHIFT 24 711 711 712 + #define SCALER_CTL0_TILING_MASK VC4_MASK(21, 20) 713 + #define SCALER_CTL0_TILING_SHIFT 20 714 + #define SCALER_CTL0_TILING_LINEAR 0 715 + #define SCALER_CTL0_TILING_64B 1 716 + #define SCALER_CTL0_TILING_128B 2 717 + #define SCALER_CTL0_TILING_256B_OR_T 3 718 + 712 719 #define SCALER_CTL0_HFLIP BIT(16) 713 720 #define SCALER_CTL0_VFLIP BIT(15) 714 721 ··· 845 838 #define SCALER_PPF_KERNEL_OFFSET_SHIFT 0 846 839 #define SCALER_PPF_KERNEL_UNCACHED BIT(31) 847 840 841 + /* PITCH0/1/2 fields for raster. */ 848 842 #define SCALER_SRC_PITCH_MASK VC4_MASK(15, 0) 849 843 #define SCALER_SRC_PITCH_SHIFT 0 844 + 845 + /* PITCH0 fields for T-tiled. */ 846 + #define SCALER_PITCH0_TILE_WIDTH_L_MASK VC4_MASK(22, 16) 847 + #define SCALER_PITCH0_TILE_WIDTH_L_SHIFT 16 848 + #define SCALER_PITCH0_TILE_LINE_DIR BIT(15) 849 + #define SCALER_PITCH0_TILE_INITIAL_LINE_DIR BIT(14) 850 + /* Y offset within a tile. */ 851 + #define SCALER_PITCH0_TILE_Y_OFFSET_MASK VC4_MASK(13, 7) 852 + #define SCALER_PITCH0_TILE_Y_OFFSET_SHIFT 7 853 + #define SCALER_PITCH0_TILE_WIDTH_R_MASK VC4_MASK(6, 0) 854 + #define SCALER_PITCH0_TILE_WIDTH_R_SHIFT 0 850 855 851 856 #endif /* VC4_REGS_H */
+22 -1
include/uapi/drm/drm_fourcc.h
··· 182 182 #define DRM_FORMAT_MOD_VENDOR_SAMSUNG 0x04 183 183 #define DRM_FORMAT_MOD_VENDOR_QCOM 0x05 184 184 #define DRM_FORMAT_MOD_VENDOR_VIVANTE 0x06 185 + #define DRM_FORMAT_MOD_VENDOR_BROADCOM 0x07 185 186 /* add more to the end as needed */ 186 187 187 188 #define fourcc_mod_code(vendor, val) \ ··· 307 306 */ 308 307 #define DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED fourcc_mod_code(VIVANTE, 4) 309 308 310 - 311 309 /* NVIDIA Tegra frame buffer modifiers */ 312 310 313 311 /* ··· 350 350 * in full detail. 351 351 */ 352 352 #define NV_FORMAT_MOD_TEGRA_16BX2_BLOCK(v) fourcc_mod_tegra_code(2, v) 353 + 354 + /* 355 + * Broadcom VC4 "T" format 356 + * 357 + * This is the primary layout that the V3D GPU can texture from (it 358 + * can't do linear). The T format has: 359 + * 360 + * - 64b utiles of pixels in a raster-order grid according to cpp. It's 4x4 361 + * pixels at 32 bit depth. 362 + * 363 + * - 1k subtiles made of a 4x4 raster-order grid of 64b utiles (so usually 364 + * 16x16 pixels). 365 + * 366 + * - 4k tiles made of a 2x2 grid of 1k subtiles (so usually 32x32 pixels). On 367 + * even 4k tile rows, they're arranged as (BL, TL, TR, BR), and on odd rows 368 + * they're (TR, BR, BL, TL), where bottom left is start of memory. 369 + * 370 + * - an image made of 4k tiles in rows either left-to-right (even rows of 4k 371 + * tiles) or right-to-left (odd rows of 4k tiles). 372 + */ 373 + #define DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED fourcc_mod_code(BROADCOM, 1) 353 374 354 375 #if defined(__cplusplus) 355 376 }