Remove ARM32 assembly/pixman blitters

-62

CMakeLists.txt

··· 292 292 dep_option(SDL_ALTIVEC "Use Altivec assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_POWERPC32 OR SDL_CPU_POWERPC64" OFF) 293 293 dep_option(SDL_ARMSIMD "Use SIMD assembly blitters on ARM" OFF "SDL_ASSEMBLY;SDL_CPU_ARM32" OFF) 294 294 dep_option(SDL_ARMNEON "Use NEON assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_ARM32 OR SDL_CPU_ARM64" OFF) 295 - dep_option(SDL_ARMNEON_BLITTERS "Use NEON assembly blitters on ARM32" OFF "SDL_VIDEO;SDL_ASSEMBLY;SDL_ARMNEON;SDL_CPU_ARM32" OFF) 296 295 dep_option(SDL_LSX "Use LSX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF) 297 296 dep_option(SDL_LASX "Use LASX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF) 298 297 ··· 880 879 cmake_pop_check_state() 881 880 if(COMPILER_SUPPORTS_LASX AND HAVE_LASXINTRIN_H) 882 881 set(HAVE_LASX TRUE) 883 - endif() 884 - endif() 885 - 886 - if(SDL_ARMSIMD) 887 - cmake_push_check_state() 888 - string(APPEND CMAKE_REQUIRED_FLAGS " -x assembler-with-cpp") 889 - list(APPEND CMAKE_REQUIRED_LINK_OPTIONS -x none) 890 - check_c_source_compiles(" 891 - .text 892 - .arch armv6 893 - .object_arch armv4 894 - .arm 895 - .altmacro 896 - #ifndef __ARM_EABI__ 897 - #error EABI is required (to be sure that calling conventions are compatible) 898 - #endif 899 - main: 900 - .global main 901 - pld [r0] 902 - uqadd8 r0, r0, r0 903 - " ARMSIMD_FOUND) 904 - cmake_pop_check_state() 905 - 906 - if(ARMSIMD_FOUND) 907 - set(HAVE_ARMSIMD TRUE) 908 - set(SDL_ARM_SIMD_BLITTERS 1) 909 - enable_language(ASM) 910 - sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/arm/pixman-arm-simd*.S") 911 - set_property(SOURCE ${ARMSIMD_SOURCES} APPEND PROPERTY COMPILE_OPTIONS -x assembler-with-cpp) 912 - set(WARN_ABOUT_ARM_SIMD_ASM_MIT TRUE) 913 - endif() 914 - endif() 915 - 916 - if(SDL_ARMNEON_BLITTERS) 917 - cmake_push_check_state() 918 - string(APPEND CMAKE_REQUIRED_FLAGS " -x assembler-with-cpp") 919 - list(APPEND CMAKE_REQUIRED_LINK_OPTIONS -x none) 920 - check_c_source_compiles(" 921 - .text 922 - .fpu 
neon 923 - .arch armv7a 924 - .object_arch armv4 925 - .eabi_attribute 10, 0 926 - .arm 927 - .altmacro 928 - #ifndef __ARM_EABI__ 929 - #error EABI is required (to be sure that calling conventions are compatible) 930 - #endif 931 - main: 932 - .global main 933 - pld [r0] 934 - vmovn.u16 d0, q0 935 - " COMPILER_SUPPORTS_ARMNEON_ASSEMBLY) 936 - cmake_pop_check_state() 937 - if(COMPILER_SUPPORTS_ARMNEON_ASSEMBLY) 938 - set(HAVE_ARMNEON_BLITTERS TRUE) 939 - set(SDL_ARM_NEON_BLITTERS 1) 940 - enable_language(ASM) 941 - sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/arm/pixman-arm-neon*.S") 942 - set_property(SOURCE ${ARMNEON_SOURCES} APPEND PROPERTY COMPILE_OPTIONS -x assembler-with-cpp) 943 - set(WARN_ABOUT_ARM_NEON_ASM_MIT TRUE) 944 882 endif() 945 883 endif() 946 884

-4

cmake/3rdparty.cmake

··· 25 25 # HIDAPI Steam controller 26 26 "controller_constants.h" 27 27 "controller_structs.h" 28 - # Nokia Pixman 29 - "pixman-arm-asm.h" 30 - "pixman-arm-neon-asm.h" 31 - "pixman-arm-simd-asm.h" 32 28 # YUV2RGB 33 29 "yuv_rgb.c" 34 30 "yuv_rgb_lsx_func.h"

-6

src/video/SDL_blit.h

··· 23 23 #ifndef SDL_blit_h_ 24 24 #define SDL_blit_h_ 25 25 26 - /* pixman ARM blitters are 32 bit only : */ 27 - #if defined(__aarch64__) || defined(_M_ARM64) 28 - #undef SDL_ARM_SIMD_BLITTERS 29 - #undef SDL_ARM_NEON_BLITTERS 30 - #endif 31 - 32 26 /* Table to do pixel byte expansion */ 33 27 extern const Uint8 *SDL_expand_byte[9]; 34 28 extern const Uint16 SDL_expand_byte_10[];

+1 -87

src/video/SDL_blit_A.c

··· 421 421 422 422 #endif /* SDL_MMX_INTRINSICS */ 423 423 424 - #ifdef SDL_ARM_SIMD_BLITTERS 425 - void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); 426 - 427 - static void BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo *info) 428 - { 429 - int32_t width = info->dst_w; 430 - int32_t height = info->dst_h; 431 - uint16_t *dstp = (uint16_t *)info->dst; 432 - int32_t dststride = width + (info->dst_skip >> 1); 433 - uint32_t *srcp = (uint32_t *)info->src; 434 - int32_t srcstride = width + (info->src_skip >> 2); 435 - 436 - BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride); 437 - } 438 - 439 - void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); 440 - 441 - static void BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo *info) 442 - { 443 - int32_t width = info->dst_w; 444 - int32_t height = info->dst_h; 445 - uint32_t *dstp = (uint32_t *)info->dst; 446 - int32_t dststride = width + (info->dst_skip >> 2); 447 - uint32_t *srcp = (uint32_t *)info->src; 448 - int32_t srcstride = width + (info->src_skip >> 2); 449 - 450 - BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride); 451 - } 452 - #endif 453 - 454 - #ifdef SDL_ARM_NEON_BLITTERS 455 - void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); 456 - 457 - static void BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo *info) 458 - { 459 - int32_t width = info->dst_w; 460 - int32_t height = info->dst_h; 461 - uint16_t *dstp = (uint16_t *)info->dst; 462 - int32_t dststride = width + (info->dst_skip >> 1); 463 - uint32_t *srcp = (uint32_t *)info->src; 464 - int32_t srcstride = width + (info->src_skip >> 2); 465 - 466 - BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride); 467 - } 468 - 469 - void 
BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); 470 - 471 - static void BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo *info) 472 - { 473 - int32_t width = info->dst_w; 474 - int32_t height = info->dst_h; 475 - uint32_t *dstp = (uint32_t *)info->dst; 476 - int32_t dststride = width + (info->dst_skip >> 2); 477 - uint32_t *srcp = (uint32_t *)info->src; 478 - int32_t srcstride = width + (info->src_skip >> 2); 479 - 480 - BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride); 481 - } 482 - #endif 483 - 484 424 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ 485 425 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) 486 426 { ··· 1274 1214 } 1275 1215 1276 1216 case 2: 1277 - #if defined(SDL_ARM_NEON_BLITTERS) || defined(SDL_ARM_SIMD_BLITTERS) 1278 - if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && df->Gmask == 0x7e0 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { 1279 - #ifdef SDL_ARM_NEON_BLITTERS 1280 - if (SDL_HasNEON()) { 1281 - return BlitARGBto565PixelAlphaARMNEON; 1282 - } 1283 - #endif 1284 - #ifdef SDL_ARM_SIMD_BLITTERS 1285 - if (SDL_HasARMSIMD()) { 1286 - return BlitARGBto565PixelAlphaARMSIMD; 1287 - } 1288 - #endif 1289 - } 1290 - #endif 1291 - if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { 1217 + if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { 1292 1218 if (df->Gmask == 0x7e0) { 1293 1219 return BlitARGBto565PixelAlpha; 1294 1220 } else if (df->Gmask == 0x3e0) { ··· 1311 1237 } 1312 1238 } 1313 1239 #endif /* SDL_MMX_INTRINSICS */ 1314 - if (sf->Amask == 0xff000000) { 1315 - #ifdef 
SDL_ARM_NEON_BLITTERS 1316 - if (SDL_HasNEON()) { 1317 - return BlitRGBtoRGBPixelAlphaARMNEON; 1318 - } 1319 - #endif 1320 - #ifdef SDL_ARM_SIMD_BLITTERS 1321 - if (SDL_HasARMSIMD()) { 1322 - return BlitRGBtoRGBPixelAlphaARMSIMD; 1323 - } 1324 - #endif 1325 - } 1326 1240 } 1327 1241 return BlitNtoNPixelAlpha; 1328 1242

+2 -81

src/video/SDL_fillrect.c

··· 247 247 return SDL_FillSurfaceRects(dst, rect, 1, color); 248 248 } 249 249 250 - #ifdef SDL_ARM_NEON_BLITTERS 251 - void FillSurfaceRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src); 252 - void FillSurfaceRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src); 253 - void FillSurfaceRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src); 254 - 255 - static void fill_8_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h) 256 - { 257 - FillSurfaceRect8ARMNEONAsm(w, h, (uint8_t *)pixels, pitch >> 0, color); 258 - return; 259 - } 260 - 261 - static void fill_16_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h) 262 - { 263 - FillSurfaceRect16ARMNEONAsm(w, h, (uint16_t *)pixels, pitch >> 1, color); 264 - return; 265 - } 266 - 267 - static void fill_32_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h) 268 - { 269 - FillSurfaceRect32ARMNEONAsm(w, h, (uint32_t *)pixels, pitch >> 2, color); 270 - return; 271 - } 272 - #endif 273 - 274 - #ifdef SDL_ARM_SIMD_BLITTERS 275 - void FillSurfaceRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src); 276 - void FillSurfaceRect16ARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src); 277 - void FillSurfaceRect32ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src); 278 - 279 - static void fill_8_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h) 280 - { 281 - FillSurfaceRect8ARMSIMDAsm(w, h, (uint8_t *)pixels, pitch >> 0, color); 282 - return; 283 - } 284 - 285 - static void fill_16_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h) 286 - { 287 - FillSurfaceRect16ARMSIMDAsm(w, h, (uint16_t *)pixels, pitch >> 1, color); 288 - return; 289 - } 290 - 291 - static void fill_32_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h) 292 - { 293 - FillSurfaceRect32ARMSIMDAsm(w, h, (uint32_t *)pixels, pitch >> 2, 
color); 294 - return; 295 - } 296 - #endif 297 - 298 250 int SDL_FillSurfaceRects(SDL_Surface *dst, const SDL_Rect *rects, int count, 299 251 Uint32 color) 300 252 { ··· 339 291 return SDL_SetError("SDL_FillSurfaceRects(): Unsupported surface format"); 340 292 } 341 293 342 - #ifdef SDL_ARM_NEON_BLITTERS 343 - if (SDL_HasNEON() && dst->format->bytes_per_pixel != 3 && !fill_function) { 344 - switch (dst->format->bytes_per_pixel) { 345 - case 1: 346 - fill_function = fill_8_neon; 347 - break; 348 - case 2: 349 - fill_function = fill_16_neon; 350 - break; 351 - case 4: 352 - fill_function = fill_32_neon; 353 - break; 354 - } 355 - } 356 - #endif 357 - #ifdef SDL_ARM_SIMD_BLITTERS 358 - if (SDL_HasARMSIMD() && dst->format->bytes_per_pixel != 3 && !fill_function) { 359 - switch (dst->format->bytes_per_pixel) { 360 - case 1: 361 - fill_function = fill_8_simd; 362 - break; 363 - case 2: 364 - fill_function = fill_16_simd; 365 - break; 366 - case 4: 367 - fill_function = fill_32_simd; 368 - break; 369 - } 370 - } 371 - #endif 372 - 373 - if (!fill_function) { 374 - switch (dst->format->bytes_per_pixel) { 294 + if (fill_function == NULL) { 295 + switch (dst->format->BytesPerPixel) { 375 296 case 1: 376 297 { 377 298 color |= (color << 8);

-36

src/video/arm/pixman-arm-asm.h

··· 1 - /* 2 - * Copyright © 2010 Nokia Corporation 3 - * 4 - * Permission to use, copy, modify, distribute, and sell this software and its 5 - * documentation for any purpose is hereby granted without fee, provided that 6 - * the above copyright notice appear in all copies and that both that 7 - * copyright notice and this permission notice appear in supporting 8 - * documentation, and that the name of Mozilla Corporation not be used in 9 - * advertising or publicity pertaining to distribution of the software without 10 - * specific, written prior permission. Mozilla Corporation makes no 11 - * representations about the suitability of this software for any purpose. It 12 - * is provided "as is" without express or implied warranty. 13 - * 14 - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS 15 - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 16 - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY 17 - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 18 - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 19 - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 20 - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 21 - * SOFTWARE. 22 - * 23 - * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) 24 - * 25 - */ 26 - 27 - /* Supplementary macro for setting function attributes */ 28 - .macro pixman_asm_function fname 29 - .func fname 30 - .global fname 31 - #ifdef __ELF__ 32 - .hidden fname 33 - .type fname, %function 34 - #endif 35 - fname: 36 - .endm

-375

src/video/arm/pixman-arm-neon-asm.S

··· 1 - /* 2 - * Copyright © 2009 Nokia Corporation 3 - * 4 - * Permission is hereby granted, free of charge, to any person obtaining a 5 - * copy of this software and associated documentation files (the "Software"), 6 - * to deal in the Software without restriction, including without limitation 7 - * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 - * and/or sell copies of the Software, and to permit persons to whom the 9 - * Software is furnished to do so, subject to the following conditions: 10 - * 11 - * The above copyright notice and this permission notice (including the next 12 - * paragraph) shall be included in all copies or substantial portions of the 13 - * Software. 14 - * 15 - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 - * DEALINGS IN THE SOFTWARE. 22 - * 23 - * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) 24 - */ 25 - 26 - /* 27 - * Copyright (c) 2018 RISC OS Open Ltd 28 - * 29 - * This software is provided 'as-is', without any express or implied 30 - * warranty. In no event will the authors be held liable for any damages 31 - * arising from the use of this software. 32 - * 33 - * Permission is granted to anyone to use this software for any purpose, 34 - * including commercial applications, and to alter it and redistribute it 35 - * freely, subject to the following restrictions: 36 - * 37 - * 1. The origin of this software must not be misrepresented; you must not 38 - * claim that you wrote the original software. 
If you use this software 39 - * in a product, an acknowledgment in the product documentation would be 40 - * appreciated but is not required. 41 - * 2. Altered source versions must be plainly marked as such, and must not be 42 - * misrepresented as being the original software. 43 - * 3. This notice may not be removed or altered from any source distribution. 44 - */ 45 - 46 - /* Prevent the stack from becoming executable for no reason... */ 47 - #if defined(__linux__) && defined(__ELF__) 48 - .section .note.GNU-stack,"",%progbits 49 - #endif 50 - 51 - .text 52 - .fpu neon 53 - .arch armv7a 54 - .object_arch armv4 55 - .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ 56 - .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ 57 - .arm 58 - .altmacro 59 - .p2align 2 60 - 61 - #include "pixman-arm-asm.h" 62 - #include "pixman-arm-neon-asm.h" 63 - 64 - /* Global configuration options and preferences */ 65 - 66 - /* 67 - * The code can optionally make use of unaligned memory accesses to improve 68 - * performance of handling leading/trailing pixels for each scanline. 69 - * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for 70 - * example in linux if unaligned memory accesses are not configured to 71 - * generate.exceptions. 72 - */ 73 - .set RESPECT_STRICT_ALIGNMENT, 1 74 - 75 - /* 76 - * Set default prefetch type. 
There is a choice between the following options: 77 - * 78 - * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work 79 - * as NOP to workaround some HW bugs or for whatever other reason) 80 - * 81 - * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where 82 - * advanced prefetch introduces heavy overhead) 83 - * 84 - * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 85 - * which can run ARM and NEON instructions simultaneously so that extra ARM 86 - * instructions do not add (many) extra cycles, but improve prefetch efficiency) 87 - * 88 - * Note: some types of function can't support advanced prefetch and fallback 89 - * to simple one (those which handle 24bpp pixels) 90 - */ 91 - .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED 92 - 93 - /* Prefetch distance in pixels for simple prefetch */ 94 - .set PREFETCH_DISTANCE_SIMPLE, 64 95 - 96 - /******************************************************************************/ 97 - 98 - /* We can actually do significantly better than the Pixman macros, at least for 99 - * the case of fills, by using a carefully scheduled inner loop. Cortex-A53 100 - * shows an improvement of up to 78% in ideal cases (large fills to L1 cache). 
101 - */ 102 - 103 - .macro generate_fillrect_function name, bpp, log2Bpp 104 - /* 105 - * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src); 106 - * On entry: 107 - * a1 = width, pixels 108 - * a2 = height, rows 109 - * a3 = pointer to top-left destination pixel 110 - * a4 = stride, pixels 111 - * [sp] = pixel value to fill with 112 - * Within the function: 113 - * v1 = width remaining 114 - * v2 = vst offset 115 - * v3 = alternate pointer 116 - * ip = data ARM register 117 - */ 118 - pixman_asm_function name 119 - vld1.\bpp {d0[],d1[]}, [sp] 120 - sub a4, a1 121 - vld1.\bpp {d2[],d3[]}, [sp] 122 - cmp a1, #(15+64) >> \log2Bpp 123 - push {v1-v3,lr} 124 - vmov ip, s0 125 - blo 51f 126 - 127 - /* Long-row case */ 128 - mov v2, #64 129 - 1: mov v1, a1 130 - ands v3, a3, #15 131 - beq 2f 132 - /* Leading pixels */ 133 - rsb v3, v3, #16 /* number of leading bytes until 16-byte aligned */ 134 - sub v1, v1, v3, lsr #\log2Bpp 135 - rbit v3, v3 136 - .if bpp <= 16 137 - .if bpp == 8 138 - tst a3, #1 /* bit 0 unaffected by rsb so can avoid register interlock */ 139 - strneb ip, [a3], #1 140 - tst v3, #1<<30 141 - .else 142 - tst a3, #2 /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */ 143 - .endif 144 - strneh ip, [a3], #2 145 - .endif 146 - movs v3, v3, lsl #3 147 - vstmcs a3!, {s0} 148 - vstmmi a3!, {d0} 149 - 2: sub v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */ 150 - add v3, a3, #32 151 - /* Inner loop */ 152 - 3: vst1.\bpp {q0-q1}, [a3 :128], v2 153 - subs v1, v1, #64 >> \log2Bpp 154 - vst1.\bpp {q0-q1}, [v3 :128], v2 155 - bhs 3b 156 - /* Trailing pixels */ 157 - 4: movs v1, v1, lsl #27 + \log2Bpp 158 - bcc 5f 159 - vst1.\bpp {q0-q1}, [a3 :128]! 160 - 5: bpl 6f 161 - vst1.\bpp {q0}, [a3 :128]! 
162 - 6: movs v1, v1, lsl #2 163 - vstmcs a3!, {d0} 164 - vstmmi a3!, {s0} 165 - .if bpp <= 16 166 - movs v1, v1, lsl #2 167 - strcsh ip, [a3], #2 168 - .if bpp == 8 169 - strmib ip, [a3], #1 170 - .endif 171 - .endif 172 - subs a2, a2, #1 173 - add a3, a3, a4, lsl #\log2Bpp 174 - bhi 1b 175 - pop {v1-v3,pc} 176 - 177 - /* Short-row case */ 178 - 51: movs v1, a1 179 - .if bpp == 8 180 - tst a3, #3 181 - beq 53f 182 - 52: subs v1, v1, #1 183 - blo 57f 184 - strb ip, [a3], #1 185 - tst a3, #3 186 - bne 52b 187 - .elseif bpp == 16 188 - tstne a3, #2 189 - subne v1, v1, #1 190 - strneh ip, [a3], #2 191 - .endif 192 - 53: cmp v1, #32 >> \log2Bpp 193 - bcc 54f 194 - vst1.\bpp {q0-q1}, [a3]! 195 - sub v1, v1, #32 >> \log2Bpp 196 - /* Trailing pixels */ 197 - 54: movs v1, v1, lsl #27 + \log2Bpp 198 - bcc 55f 199 - vst1.\bpp {q0-q1}, [a3]! 200 - 55: bpl 56f 201 - vst1.\bpp {q0}, [a3]! 202 - 56: movs v1, v1, lsl #2 203 - vstmcs a3!, {d0} 204 - vstmmi a3!, {s0} 205 - .if bpp <= 16 206 - movs v1, v1, lsl #2 207 - strcsh ip, [a3], #2 208 - .if bpp == 8 209 - strmib ip, [a3], #1 210 - .endif 211 - .endif 212 - subs a2, a2, #1 213 - add a3, a3, a4, lsl #\log2Bpp 214 - bhi 51b 215 - 57: pop {v1-v3,pc} 216 - 217 - .endfunc 218 - .endm 219 - 220 - generate_fillrect_function FillSurfaceRect32ARMNEONAsm, 32, 2 221 - generate_fillrect_function FillSurfaceRect16ARMNEONAsm, 16, 1 222 - generate_fillrect_function FillSurfaceRect8ARMNEONAsm, 8, 0 223 - 224 - /******************************************************************************/ 225 - 226 - .macro RGBtoRGBPixelAlpha_process_pixblock_head 227 - vmvn d30, d3 /* get inverted source alpha */ 228 - vmov d31, d7 /* dest alpha is always unchanged */ 229 - vmull.u8 q14, d0, d3 230 - vmlal.u8 q14, d4, d30 231 - vmull.u8 q0, d1, d3 232 - vmlal.u8 q0, d5, d30 233 - vmull.u8 q1, d2, d3 234 - vmlal.u8 q1, d6, d30 235 - vrshr.u16 q2, q14, #8 236 - vrshr.u16 q3, q0, #8 237 - vraddhn.u16 d28, q14, q2 238 - vrshr.u16 q2, q1, #8 239 - vraddhn.u16 
d29, q0, q3 240 - vraddhn.u16 d30, q1, q2 241 - .endm 242 - 243 - .macro RGBtoRGBPixelAlpha_process_pixblock_tail 244 - /* nothing */ 245 - .endm 246 - 247 - .macro RGBtoRGBPixelAlpha_process_pixblock_tail_head 248 - vld4.8 {d0-d3}, [SRC]! 249 - PF add PF_X, PF_X, #8 250 - vst4.8 {d28-d31}, [DST_W :128]! 251 - PF tst PF_CTL, #0xF 252 - vld4.8 {d4-d7}, [DST_R :128]! 253 - PF addne PF_X, PF_X, #8 254 - vmvn d30, d3 /* get inverted source alpha */ 255 - vmov d31, d7 /* dest alpha is always unchanged */ 256 - vmull.u8 q14, d0, d3 257 - PF subne PF_CTL, PF_CTL, #1 258 - vmlal.u8 q14, d4, d30 259 - PF cmp PF_X, ORIG_W 260 - vmull.u8 q0, d1, d3 261 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 262 - vmlal.u8 q0, d5, d30 263 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 264 - vmull.u8 q1, d2, d3 265 - PF subge PF_X, PF_X, ORIG_W 266 - vmlal.u8 q1, d6, d30 267 - PF subges PF_CTL, PF_CTL, #0x10 268 - vrshr.u16 q2, q14, #8 269 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 270 - vrshr.u16 q3, q0, #8 271 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
272 - vraddhn.u16 d28, q14, q2 273 - vrshr.u16 q2, q1, #8 274 - vraddhn.u16 d29, q0, q3 275 - vraddhn.u16 d30, q1, q2 276 - .endm 277 - 278 - generate_composite_function \ 279 - BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \ 280 - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 281 - 8, /* number of pixels, processed in a single block */ \ 282 - 5, /* prefetch distance */ \ 283 - default_init, \ 284 - default_cleanup, \ 285 - RGBtoRGBPixelAlpha_process_pixblock_head, \ 286 - RGBtoRGBPixelAlpha_process_pixblock_tail, \ 287 - RGBtoRGBPixelAlpha_process_pixblock_tail_head 288 - 289 - /******************************************************************************/ 290 - 291 - .macro ARGBto565PixelAlpha_process_pixblock_head 292 - vmvn d6, d3 293 - vshr.u8 d1, #2 294 - vshr.u8 d3, #3 295 - vshr.u8 d0, #3 296 - vshrn.u16 d7, q2, #3 297 - vshrn.u16 d25, q2, #8 298 - vbic.i16 q2, #0xe0 299 - vshr.u8 d6, #3 300 - vshr.u8 d7, #2 301 - vshr.u8 d2, #3 302 - vmovn.u16 d24, q2 303 - vshr.u8 d25, #3 304 - vmull.u8 q13, d1, d3 305 - vmlal.u8 q13, d7, d6 306 - vmull.u8 q14, d0, d3 307 - vmlal.u8 q14, d24, d6 308 - vmull.u8 q15, d2, d3 309 - vmlal.u8 q15, d25, d6 310 - .endm 311 - 312 - .macro ARGBto565PixelAlpha_process_pixblock_tail 313 - vsra.u16 q13, #5 314 - vsra.u16 q14, #5 315 - vsra.u16 q15, #5 316 - vrshr.u16 q13, #5 317 - vrshr.u16 q14, #5 318 - vrshr.u16 q15, #5 319 - vsli.u16 q14, q13, #5 320 - vsli.u16 q14, q15, #11 321 - .endm 322 - 323 - .macro ARGBto565PixelAlpha_process_pixblock_tail_head 324 - vld4.8 {d0-d3}, [SRC]! 325 - PF add PF_X, PF_X, #8 326 - vsra.u16 q13, #5 327 - PF tst PF_CTL, #0xF 328 - vsra.u16 q14, #5 329 - PF addne PF_X, PF_X, #8 330 - vsra.u16 q15, #5 331 - PF subne PF_CTL, PF_CTL, #1 332 - vrshr.u16 q13, #5 333 - PF cmp PF_X, ORIG_W 334 - vrshr.u16 q14, #5 335 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 336 - vrshr.u16 q15, #5 337 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 338 - vld1.8 {d4-d5}, [DST_R]! 
339 - PF subge PF_X, PF_X, ORIG_W 340 - vsli.u16 q14, q13, #5 341 - PF subges PF_CTL, PF_CTL, #0x10 342 - vsli.u16 q14, q15, #11 343 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 344 - vst1.8 {q14}, [DST_W :128]! 345 - vmvn d6, d3 346 - vshr.u8 d1, #2 347 - vshr.u8 d3, #3 348 - vshr.u8 d0, #3 349 - vshrn.u16 d7, q2, #3 350 - vshrn.u16 d25, q2, #8 351 - vbic.i16 q2, #0xe0 352 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 353 - vshr.u8 d6, #3 354 - vshr.u8 d7, #2 355 - vshr.u8 d2, #3 356 - vmovn.u16 d24, q2 357 - vshr.u8 d25, #3 358 - vmull.u8 q13, d1, d3 359 - vmlal.u8 q13, d7, d6 360 - vmull.u8 q14, d0, d3 361 - vmlal.u8 q14, d24, d6 362 - vmull.u8 q15, d2, d3 363 - vmlal.u8 q15, d25, d6 364 - .endm 365 - 366 - generate_composite_function \ 367 - BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \ 368 - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 369 - 8, /* number of pixels, processed in a single block */ \ 370 - 6, /* prefetch distance */ \ 371 - default_init, \ 372 - default_cleanup, \ 373 - ARGBto565PixelAlpha_process_pixblock_head, \ 374 - ARGBto565PixelAlpha_process_pixblock_tail, \ 375 - ARGBto565PixelAlpha_process_pixblock_tail_head

-1184

src/video/arm/pixman-arm-neon-asm.h

··· 1 - /* 2 - * Copyright © 2009 Nokia Corporation 3 - * 4 - * Permission is hereby granted, free of charge, to any person obtaining a 5 - * copy of this software and associated documentation files (the "Software"), 6 - * to deal in the Software without restriction, including without limitation 7 - * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 - * and/or sell copies of the Software, and to permit persons to whom the 9 - * Software is furnished to do so, subject to the following conditions: 10 - * 11 - * The above copyright notice and this permission notice (including the next 12 - * paragraph) shall be included in all copies or substantial portions of the 13 - * Software. 14 - * 15 - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 - * DEALINGS IN THE SOFTWARE. 22 - * 23 - * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) 24 - */ 25 - 26 - /* 27 - * This file contains a macro ('generate_composite_function') which can 28 - * construct 2D image processing functions, based on a common template. 29 - * Any combinations of source, destination and mask images with 8bpp, 30 - * 16bpp, 24bpp, 32bpp color formats are supported. 31 - * 32 - * This macro takes care of: 33 - * - handling of leading and trailing unaligned pixels 34 - * - doing most of the work related to L2 cache preload 35 - * - encourages the use of software pipelining for better instructions 36 - * scheduling 37 - * 38 - * The user of this macro has to provide some configuration parameters 39 - * (bit depths for the images, prefetch distance, etc.) 
and a set of 40 - * macros, which should implement basic code chunks responsible for 41 - * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage 42 - * examples. 43 - * 44 - * TODO: 45 - * - try overlapped pixel method (from Ian Rickards) when processing 46 - * exactly two blocks of pixels 47 - * - maybe add an option to do reverse scanline processing 48 - */ 49 - 50 - /* 51 - * Bit flags for 'generate_composite_function' macro which are used 52 - * to tune generated functions behavior. 53 - */ 54 - .set FLAG_DST_WRITEONLY, 0 55 - .set FLAG_DST_READWRITE, 1 56 - .set FLAG_DEINTERLEAVE_32BPP, 2 57 - 58 - /* 59 - * Offset in stack where mask and source pointer/stride can be accessed 60 - * from 'init' macro. This is useful for doing special handling for solid mask. 61 - */ 62 - .set ARGS_STACK_OFFSET, 40 63 - 64 - /* 65 - * Constants for selecting preferable prefetch type. 66 - */ 67 - .set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ 68 - .set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ 69 - .set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ 70 - 71 - /* 72 - * Definitions of supplementary pixld/pixst macros (for partial load/store of 73 - * pixel data). 74 - */ 75 - 76 - .macro pixldst1 op, elem_size, reg1, mem_operand, abits 77 - .if abits > 0 78 - op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]! 79 - .else 80 - op&.&elem_size {d&reg1}, [&mem_operand&]! 81 - .endif 82 - .endm 83 - 84 - .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits 85 - .if abits > 0 86 - op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]! 87 - .else 88 - op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]! 89 - .endif 90 - .endm 91 - 92 - .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits 93 - .if abits > 0 94 - op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]! 95 - .else 96 - op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]! 
97 - .endif 98 - .endm 99 - 100 - .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits 101 - op&.&elem_size {d&reg1[idx]}, [&mem_operand&]! 102 - .endm 103 - 104 - .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand 105 - op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]! 106 - .endm 107 - 108 - .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand 109 - op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]! 110 - .endm 111 - 112 - .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits 113 - .if numbytes == 32 114 - pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ 115 - %(basereg+6), %(basereg+7), mem_operand, abits 116 - .elseif numbytes == 16 117 - pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits 118 - .elseif numbytes == 8 119 - pixldst1 op, elem_size, %(basereg+1), mem_operand, abits 120 - .elseif numbytes == 4 121 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) 122 - pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits 123 - .elseif elem_size == 16 124 - pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits 125 - pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits 126 - .else 127 - pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits 128 - pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits 129 - pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits 130 - pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits 131 - .endif 132 - .elseif numbytes == 2 133 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) 134 - pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits 135 - .else 136 - pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits 137 - pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits 138 - .endif 139 - .elseif numbytes == 1 140 - pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits 141 - .else 142 - .error "unsupported size: numbytes" 143 - .endif 144 - .endm 145 - 146 - .macro pixld numpix, bpp, basereg, mem_operand, abits=0 147 - .if bpp > 0 148 - .if (bpp == 
32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) 149 - pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ 150 - %(basereg+6), %(basereg+7), mem_operand, abits 151 - .elseif (bpp == 24) && (numpix == 8) 152 - pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand 153 - .elseif (bpp == 24) && (numpix == 4) 154 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand 155 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand 156 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand 157 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand 158 - .elseif (bpp == 24) && (numpix == 2) 159 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand 160 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand 161 - .elseif (bpp == 24) && (numpix == 1) 162 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand 163 - .else 164 - pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits 165 - .endif 166 - .endif 167 - .endm 168 - 169 - .macro pixst numpix, bpp, basereg, mem_operand, abits=0 170 - .if bpp > 0 171 - .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) 172 - pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ 173 - %(basereg+6), %(basereg+7), mem_operand, abits 174 - .elseif (bpp == 24) && (numpix == 8) 175 - pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand 176 - .elseif (bpp == 24) && (numpix == 4) 177 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand 178 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand 179 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand 180 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand 181 - .elseif (bpp == 24) && (numpix == 2) 182 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, 
mem_operand 183 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand 184 - .elseif (bpp == 24) && (numpix == 1) 185 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand 186 - .else 187 - pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits 188 - .endif 189 - .endif 190 - .endm 191 - 192 - .macro pixld_a numpix, bpp, basereg, mem_operand 193 - .if (bpp * numpix) <= 128 194 - pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) 195 - .else 196 - pixld numpix, bpp, basereg, mem_operand, 128 197 - .endif 198 - .endm 199 - 200 - .macro pixst_a numpix, bpp, basereg, mem_operand 201 - .if (bpp * numpix) <= 128 202 - pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) 203 - .else 204 - pixst numpix, bpp, basereg, mem_operand, 128 205 - .endif 206 - .endm 207 - 208 - /* 209 - * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register 210 - * aliases to be defined) 211 - */ 212 - .macro pixld1_s elem_size, reg1, mem_operand 213 - .if elem_size == 16 214 - mov TMP1, VX, asr #16 215 - adds VX, VX, UNIT_X 216 - 5: subpls VX, VX, SRC_WIDTH_FIXED 217 - bpl 5b 218 - add TMP1, mem_operand, TMP1, asl #1 219 - mov TMP2, VX, asr #16 220 - adds VX, VX, UNIT_X 221 - 5: subpls VX, VX, SRC_WIDTH_FIXED 222 - bpl 5b 223 - add TMP2, mem_operand, TMP2, asl #1 224 - vld1.16 {d&reg1&[0]}, [TMP1, :16] 225 - mov TMP1, VX, asr #16 226 - adds VX, VX, UNIT_X 227 - 5: subpls VX, VX, SRC_WIDTH_FIXED 228 - bpl 5b 229 - add TMP1, mem_operand, TMP1, asl #1 230 - vld1.16 {d&reg1&[1]}, [TMP2, :16] 231 - mov TMP2, VX, asr #16 232 - adds VX, VX, UNIT_X 233 - 5: subpls VX, VX, SRC_WIDTH_FIXED 234 - bpl 5b 235 - add TMP2, mem_operand, TMP2, asl #1 236 - vld1.16 {d&reg1&[2]}, [TMP1, :16] 237 - vld1.16 {d&reg1&[3]}, [TMP2, :16] 238 - .elseif elem_size == 32 239 - mov TMP1, VX, asr #16 240 - adds VX, VX, UNIT_X 241 - 5: subpls VX, VX, SRC_WIDTH_FIXED 242 - bpl 5b 243 - add TMP1, mem_operand, TMP1, asl #2 244 - mov TMP2, 
VX, asr #16 245 - adds VX, VX, UNIT_X 246 - 5: subpls VX, VX, SRC_WIDTH_FIXED 247 - bpl 5b 248 - add TMP2, mem_operand, TMP2, asl #2 249 - vld1.32 {d&reg1&[0]}, [TMP1, :32] 250 - vld1.32 {d&reg1&[1]}, [TMP2, :32] 251 - .else 252 - .error "unsupported" 253 - .endif 254 - .endm 255 - 256 - .macro pixld2_s elem_size, reg1, reg2, mem_operand 257 - .if 0 /* elem_size == 32 */ 258 - mov TMP1, VX, asr #16 259 - add VX, VX, UNIT_X, asl #1 260 - add TMP1, mem_operand, TMP1, asl #2 261 - mov TMP2, VX, asr #16 262 - sub VX, VX, UNIT_X 263 - add TMP2, mem_operand, TMP2, asl #2 264 - vld1.32 {d&reg1&[0]}, [TMP1, :32] 265 - mov TMP1, VX, asr #16 266 - add VX, VX, UNIT_X, asl #1 267 - add TMP1, mem_operand, TMP1, asl #2 268 - vld1.32 {d&reg2&[0]}, [TMP2, :32] 269 - mov TMP2, VX, asr #16 270 - add VX, VX, UNIT_X 271 - add TMP2, mem_operand, TMP2, asl #2 272 - vld1.32 {d&reg1&[1]}, [TMP1, :32] 273 - vld1.32 {d&reg2&[1]}, [TMP2, :32] 274 - .else 275 - pixld1_s elem_size, reg1, mem_operand 276 - pixld1_s elem_size, reg2, mem_operand 277 - .endif 278 - .endm 279 - 280 - .macro pixld0_s elem_size, reg1, idx, mem_operand 281 - .if elem_size == 16 282 - mov TMP1, VX, asr #16 283 - adds VX, VX, UNIT_X 284 - 5: subpls VX, VX, SRC_WIDTH_FIXED 285 - bpl 5b 286 - add TMP1, mem_operand, TMP1, asl #1 287 - vld1.16 {d&reg1&[idx]}, [TMP1, :16] 288 - .elseif elem_size == 32 289 - mov TMP1, VX, asr #16 290 - adds VX, VX, UNIT_X 291 - 5: subpls VX, VX, SRC_WIDTH_FIXED 292 - bpl 5b 293 - add TMP1, mem_operand, TMP1, asl #2 294 - vld1.32 {d&reg1&[idx]}, [TMP1, :32] 295 - .endif 296 - .endm 297 - 298 - .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand 299 - .if numbytes == 32 300 - pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand 301 - pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand 302 - pixdeinterleave elem_size, %(basereg+4) 303 - .elseif numbytes == 16 304 - pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand 305 - .elseif numbytes == 8 306 - pixld1_s 
elem_size, %(basereg+1), mem_operand 307 - .elseif numbytes == 4 308 - .if elem_size == 32 309 - pixld0_s elem_size, %(basereg+0), 1, mem_operand 310 - .elseif elem_size == 16 311 - pixld0_s elem_size, %(basereg+0), 2, mem_operand 312 - pixld0_s elem_size, %(basereg+0), 3, mem_operand 313 - .else 314 - pixld0_s elem_size, %(basereg+0), 4, mem_operand 315 - pixld0_s elem_size, %(basereg+0), 5, mem_operand 316 - pixld0_s elem_size, %(basereg+0), 6, mem_operand 317 - pixld0_s elem_size, %(basereg+0), 7, mem_operand 318 - .endif 319 - .elseif numbytes == 2 320 - .if elem_size == 16 321 - pixld0_s elem_size, %(basereg+0), 1, mem_operand 322 - .else 323 - pixld0_s elem_size, %(basereg+0), 2, mem_operand 324 - pixld0_s elem_size, %(basereg+0), 3, mem_operand 325 - .endif 326 - .elseif numbytes == 1 327 - pixld0_s elem_size, %(basereg+0), 1, mem_operand 328 - .else 329 - .error "unsupported size: numbytes" 330 - .endif 331 - .endm 332 - 333 - .macro pixld_s numpix, bpp, basereg, mem_operand 334 - .if bpp > 0 335 - pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand 336 - .endif 337 - .endm 338 - 339 - .macro vuzp8 reg1, reg2 340 - vuzp.8 d&reg1, d&reg2 341 - .endm 342 - 343 - .macro vzip8 reg1, reg2 344 - vzip.8 d&reg1, d&reg2 345 - .endm 346 - 347 - /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ 348 - .macro pixdeinterleave bpp, basereg 349 - .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) 350 - vuzp8 %(basereg+0), %(basereg+1) 351 - vuzp8 %(basereg+2), %(basereg+3) 352 - vuzp8 %(basereg+1), %(basereg+3) 353 - vuzp8 %(basereg+0), %(basereg+2) 354 - .endif 355 - .endm 356 - 357 - /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ 358 - .macro pixinterleave bpp, basereg 359 - .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) 360 - vzip8 %(basereg+0), %(basereg+2) 361 - vzip8 %(basereg+1), %(basereg+3) 362 - vzip8 %(basereg+2), %(basereg+3) 363 - vzip8 %(basereg+0), %(basereg+1) 364 - .endif 365 - 
.endm 366 - 367 - /* 368 - * This is a macro for implementing cache preload. The main idea is that 369 - * cache preload logic is mostly independent from the rest of pixels 370 - * processing code. It starts at the top left pixel and moves forward 371 - * across pixels and can jump across scanlines. Prefetch distance is 372 - * handled in an 'incremental' way: it starts from 0 and advances to the 373 - * optimal distance over time. After reaching optimal prefetch distance, 374 - * it is kept constant. There are some checks which prevent prefetching 375 - * unneeded pixel lines below the image (but it still can prefetch a bit 376 - * more data on the right side of the image - not a big issue and may 377 - * be actually helpful when rendering text glyphs). Additional trick is 378 - * the use of LDR instruction for prefetch instead of PLD when moving to 379 - * the next line, the point is that we have a high chance of getting TLB 380 - * miss in this case, and PLD would be useless. 381 - * 382 - * This sounds like it may introduce a noticeable overhead (when working with 383 - * fully cached data). But in reality, due to having a separate pipeline and 384 - * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can 385 - * execute simultaneously with NEON and be completely shadowed by it. Thus 386 - * we get no performance overhead at all (*). This looks like a very nice 387 - * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher, 388 - * but still can implement some rather advanced prefetch logic in software 389 - * for almost zero cost! 390 - * 391 - * (*) The overhead of the prefetcher is visible when running some trivial 392 - * pixels processing like simple copy. Anyway, having prefetch is a must 393 - * when working with the graphics data. 
394 - */ 395 - .macro PF a, x:vararg 396 - .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) 397 - a x 398 - .endif 399 - .endm 400 - 401 - .macro cache_preload std_increment, boost_increment 402 - .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) 403 - .if regs_shortage 404 - PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ 405 - .endif 406 - .if std_increment != 0 407 - PF add PF_X, PF_X, #std_increment 408 - .endif 409 - PF tst PF_CTL, #0xF 410 - PF addne PF_X, PF_X, #boost_increment 411 - PF subne PF_CTL, PF_CTL, #1 412 - PF cmp PF_X, ORIG_W 413 - .if src_bpp_shift >= 0 414 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 415 - .endif 416 - .if dst_r_bpp != 0 417 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 418 - .endif 419 - .if mask_bpp_shift >= 0 420 - PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 421 - .endif 422 - PF subge PF_X, PF_X, ORIG_W 423 - PF subges PF_CTL, PF_CTL, #0x10 424 - .if src_bpp_shift >= 0 425 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 426 - .endif 427 - .if dst_r_bpp != 0 428 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 429 - .endif 430 - .if mask_bpp_shift >= 0 431 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 
432 - .endif 433 - .endif 434 - .endm 435 - 436 - .macro cache_preload_simple 437 - .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) 438 - .if src_bpp > 0 439 - pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] 440 - .endif 441 - .if dst_r_bpp > 0 442 - pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] 443 - .endif 444 - .if mask_bpp > 0 445 - pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] 446 - .endif 447 - .endif 448 - .endm 449 - 450 - .macro fetch_mask_pixblock 451 - pixld pixblock_size, mask_bpp, \ 452 - (mask_basereg - pixblock_size * mask_bpp / 64), MASK 453 - .endm 454 - 455 - /* 456 - * Macro which is used to process leading pixels until destination 457 - * pointer is properly aligned (at 16 bytes boundary). When destination 458 - * buffer uses 16bpp format, this is unnecessary, or even pointless. 459 - */ 460 - .macro ensure_destination_ptr_alignment process_pixblock_head, \ 461 - process_pixblock_tail, \ 462 - process_pixblock_tail_head 463 - .if dst_w_bpp != 24 464 - tst DST_R, #0xF 465 - beq 2f 466 - 467 - .irp lowbit, 1, 2, 4, 8, 16 468 - local skip1 469 - .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) 470 - .if lowbit < 16 /* we don't need more than 16-byte alignment */ 471 - tst DST_R, #lowbit 472 - beq 1f 473 - .endif 474 - pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC 475 - pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK 476 - .if dst_r_bpp > 0 477 - pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R 478 - .else 479 - add DST_R, DST_R, #lowbit 480 - .endif 481 - PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) 482 - sub W, W, #(lowbit * 8 / dst_w_bpp) 483 - 1: 484 - .endif 485 - .endr 486 - pixdeinterleave src_bpp, src_basereg 487 - pixdeinterleave mask_bpp, mask_basereg 488 - pixdeinterleave dst_r_bpp, dst_r_basereg 489 - 490 - process_pixblock_head 491 - cache_preload 0, pixblock_size 492 - cache_preload_simple 493 - process_pixblock_tail 494 - 495 - 
pixinterleave dst_w_bpp, dst_w_basereg 496 - .irp lowbit, 1, 2, 4, 8, 16 497 - .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) 498 - .if lowbit < 16 /* we don't need more than 16-byte alignment */ 499 - tst DST_W, #lowbit 500 - beq 1f 501 - .endif 502 - pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W 503 - 1: 504 - .endif 505 - .endr 506 - .endif 507 - 2: 508 - .endm 509 - 510 - /* 511 - * Special code for processing up to (pixblock_size - 1) remaining 512 - * trailing pixels. As SIMD processing performs operation on 513 - * pixblock_size pixels, anything smaller than this has to be loaded 514 - * and stored in a special way. Loading and storing of pixel data is 515 - * performed in such a way that we fill some 'slots' in the NEON 516 - * registers (some slots naturally are unused), then perform compositing 517 - * operation as usual. In the end, the data is taken from these 'slots' 518 - * and saved to memory. 519 - * 520 - * cache_preload_flag - allows to suppress prefetch if 521 - * set to 0 522 - * dst_aligned_flag - selects whether destination buffer 523 - * is aligned 524 - */ 525 - .macro process_trailing_pixels cache_preload_flag, \ 526 - dst_aligned_flag, \ 527 - process_pixblock_head, \ 528 - process_pixblock_tail, \ 529 - process_pixblock_tail_head 530 - tst W, #(pixblock_size - 1) 531 - beq 2f 532 - .irp chunk_size, 16, 8, 4, 2, 1 533 - .if pixblock_size > chunk_size 534 - tst W, #chunk_size 535 - beq 1f 536 - pixld_src chunk_size, src_bpp, src_basereg, SRC 537 - pixld chunk_size, mask_bpp, mask_basereg, MASK 538 - .if dst_aligned_flag != 0 539 - pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R 540 - .else 541 - pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R 542 - .endif 543 - .if cache_preload_flag != 0 544 - PF add PF_X, PF_X, #chunk_size 545 - .endif 546 - 1: 547 - .endif 548 - .endr 549 - pixdeinterleave src_bpp, src_basereg 550 - pixdeinterleave mask_bpp, mask_basereg 551 - pixdeinterleave 
dst_r_bpp, dst_r_basereg 552 - 553 - process_pixblock_head 554 - .if cache_preload_flag != 0 555 - cache_preload 0, pixblock_size 556 - cache_preload_simple 557 - .endif 558 - process_pixblock_tail 559 - pixinterleave dst_w_bpp, dst_w_basereg 560 - .irp chunk_size, 16, 8, 4, 2, 1 561 - .if pixblock_size > chunk_size 562 - tst W, #chunk_size 563 - beq 1f 564 - .if dst_aligned_flag != 0 565 - pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W 566 - .else 567 - pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W 568 - .endif 569 - 1: 570 - .endif 571 - .endr 572 - 2: 573 - .endm 574 - 575 - /* 576 - * Macro, which performs all the needed operations to switch to the next 577 - * scanline and start the next loop iteration unless all the scanlines 578 - * are already processed. 579 - */ 580 - .macro advance_to_next_scanline start_of_loop_label 581 - .if regs_shortage 582 - ldrd W, [sp] /* load W and H (width and height) from stack */ 583 - .else 584 - mov W, ORIG_W 585 - .endif 586 - add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift 587 - .if src_bpp != 0 588 - add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift 589 - .endif 590 - .if mask_bpp != 0 591 - add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift 592 - .endif 593 - .if (dst_w_bpp != 24) 594 - sub DST_W, DST_W, W, lsl #dst_bpp_shift 595 - .endif 596 - .if (src_bpp != 24) && (src_bpp != 0) 597 - sub SRC, SRC, W, lsl #src_bpp_shift 598 - .endif 599 - .if (mask_bpp != 24) && (mask_bpp != 0) 600 - sub MASK, MASK, W, lsl #mask_bpp_shift 601 - .endif 602 - subs H, H, #1 603 - mov DST_R, DST_W 604 - .if regs_shortage 605 - str H, [sp, #4] /* save updated height to stack */ 606 - .endif 607 - bge start_of_loop_label 608 - .endm 609 - 610 - /* 611 - * Registers are allocated in the following way by default: 612 - * d0, d1, d2, d3 - reserved for loading source pixel data 613 - * d4, d5, d6, d7 - reserved for loading destination pixel data 614 - * d24, d25, d26, d27 - reserved for loading mask pixel data 615 - * d28, d29, d30, d31 - 
final destination pixel data for writeback to memory 616 - */ 617 - .macro generate_composite_function fname, \ 618 - src_bpp_, \ 619 - mask_bpp_, \ 620 - dst_w_bpp_, \ 621 - flags, \ 622 - pixblock_size_, \ 623 - prefetch_distance, \ 624 - init, \ 625 - cleanup, \ 626 - process_pixblock_head, \ 627 - process_pixblock_tail, \ 628 - process_pixblock_tail_head, \ 629 - dst_w_basereg_ = 28, \ 630 - dst_r_basereg_ = 4, \ 631 - src_basereg_ = 0, \ 632 - mask_basereg_ = 24 633 - 634 - pixman_asm_function fname 635 - 636 - push {r4-r12, lr} /* save all registers */ 637 - 638 - /* 639 - * Select prefetch type for this function. If prefetch distance is 640 - * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch 641 - * has to be used instead of ADVANCED. 642 - */ 643 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT 644 - .if prefetch_distance == 0 645 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE 646 - .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ 647 - ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) 648 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE 649 - .endif 650 - 651 - /* 652 - * Make some macro arguments globally visible and accessible 653 - * from other macros 654 - */ 655 - .set src_bpp, src_bpp_ 656 - .set mask_bpp, mask_bpp_ 657 - .set dst_w_bpp, dst_w_bpp_ 658 - .set pixblock_size, pixblock_size_ 659 - .set dst_w_basereg, dst_w_basereg_ 660 - .set dst_r_basereg, dst_r_basereg_ 661 - .set src_basereg, src_basereg_ 662 - .set mask_basereg, mask_basereg_ 663 - 664 - .macro pixld_src x:vararg 665 - pixld x 666 - .endm 667 - .macro fetch_src_pixblock 668 - pixld_src pixblock_size, src_bpp, \ 669 - (src_basereg - pixblock_size * src_bpp / 64), SRC 670 - .endm 671 - /* 672 - * Assign symbolic names to registers 673 - */ 674 - W .req r0 /* width (is updated during processing) */ 675 - H .req r1 /* height (is updated during processing) */ 676 - DST_W .req r2 /* destination buffer pointer for writes */ 677 - DST_STRIDE .req 
r3 /* destination image stride */ 678 - SRC .req r4 /* source buffer pointer */ 679 - SRC_STRIDE .req r5 /* source image stride */ 680 - DST_R .req r6 /* destination buffer pointer for reads */ 681 - 682 - MASK .req r7 /* mask pointer */ 683 - MASK_STRIDE .req r8 /* mask stride */ 684 - 685 - PF_CTL .req r9 /* combined lines counter and prefetch */ 686 - /* distance increment counter */ 687 - PF_X .req r10 /* pixel index in a scanline for current */ 688 - /* prefetch position */ 689 - PF_SRC .req r11 /* pointer to source scanline start */ 690 - /* for prefetch purposes */ 691 - PF_DST .req r12 /* pointer to destination scanline start */ 692 - /* for prefetch purposes */ 693 - PF_MASK .req r14 /* pointer to mask scanline start */ 694 - /* for prefetch purposes */ 695 - /* 696 - * Check whether we have enough registers for all the local variables. 697 - * If we don't have enough registers, original width and height are 698 - * kept on top of stack (and 'regs_shortage' variable is set to indicate 699 - * this for the rest of code). Even if there are enough registers, the 700 - * allocation scheme may be a bit different depending on whether source 701 - * or mask is not used. 
702 - */ 703 - .if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED) 704 - ORIG_W .req r10 /* saved original width */ 705 - DUMMY .req r12 /* temporary register */ 706 - .set regs_shortage, 0 707 - .elseif mask_bpp == 0 708 - ORIG_W .req r7 /* saved original width */ 709 - DUMMY .req r8 /* temporary register */ 710 - .set regs_shortage, 0 711 - .elseif src_bpp == 0 712 - ORIG_W .req r4 /* saved original width */ 713 - DUMMY .req r5 /* temporary register */ 714 - .set regs_shortage, 0 715 - .else 716 - ORIG_W .req r1 /* saved original width */ 717 - DUMMY .req r1 /* temporary register */ 718 - .set regs_shortage, 1 719 - .endif 720 - 721 - .set mask_bpp_shift, -1 722 - .if src_bpp == 32 723 - .set src_bpp_shift, 2 724 - .elseif src_bpp == 24 725 - .set src_bpp_shift, 0 726 - .elseif src_bpp == 16 727 - .set src_bpp_shift, 1 728 - .elseif src_bpp == 8 729 - .set src_bpp_shift, 0 730 - .elseif src_bpp == 0 731 - .set src_bpp_shift, -1 732 - .else 733 - .error "requested src bpp (src_bpp) is not supported" 734 - .endif 735 - .if mask_bpp == 32 736 - .set mask_bpp_shift, 2 737 - .elseif mask_bpp == 24 738 - .set mask_bpp_shift, 0 739 - .elseif mask_bpp == 8 740 - .set mask_bpp_shift, 0 741 - .elseif mask_bpp == 0 742 - .set mask_bpp_shift, -1 743 - .else 744 - .error "requested mask bpp (mask_bpp) is not supported" 745 - .endif 746 - .if dst_w_bpp == 32 747 - .set dst_bpp_shift, 2 748 - .elseif dst_w_bpp == 24 749 - .set dst_bpp_shift, 0 750 - .elseif dst_w_bpp == 16 751 - .set dst_bpp_shift, 1 752 - .elseif dst_w_bpp == 8 753 - .set dst_bpp_shift, 0 754 - .else 755 - .error "requested dst bpp (dst_w_bpp) is not supported" 756 - .endif 757 - 758 - .if (((flags) & FLAG_DST_READWRITE) != 0) 759 - .set dst_r_bpp, dst_w_bpp 760 - .else 761 - .set dst_r_bpp, 0 762 - .endif 763 - .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) 764 - .set DEINTERLEAVE_32BPP_ENABLED, 1 765 - .else 766 - .set DEINTERLEAVE_32BPP_ENABLED, 0 767 - .endif 768 - 769 - .if prefetch_distance < 0 || 
prefetch_distance > 15 770 - .error "invalid prefetch distance (prefetch_distance)" 771 - .endif 772 - 773 - .if src_bpp > 0 774 - ldr SRC, [sp, #40] 775 - .endif 776 - .if mask_bpp > 0 777 - ldr MASK, [sp, #48] 778 - .endif 779 - PF mov PF_X, #0 780 - .if src_bpp > 0 781 - ldr SRC_STRIDE, [sp, #44] 782 - .endif 783 - .if mask_bpp > 0 784 - ldr MASK_STRIDE, [sp, #52] 785 - .endif 786 - mov DST_R, DST_W 787 - 788 - .if src_bpp == 24 789 - sub SRC_STRIDE, SRC_STRIDE, W 790 - sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 791 - .endif 792 - .if mask_bpp == 24 793 - sub MASK_STRIDE, MASK_STRIDE, W 794 - sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 795 - .endif 796 - .if dst_w_bpp == 24 797 - sub DST_STRIDE, DST_STRIDE, W 798 - sub DST_STRIDE, DST_STRIDE, W, lsl #1 799 - .endif 800 - 801 - /* 802 - * Setup advanced prefetcher initial state 803 - */ 804 - PF mov PF_SRC, SRC 805 - PF mov PF_DST, DST_R 806 - PF mov PF_MASK, MASK 807 - /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ 808 - PF mov PF_CTL, H, lsl #4 809 - PF add PF_CTL, #(prefetch_distance - 0x10) 810 - 811 - init 812 - .if regs_shortage 813 - push {r0, r1} 814 - .endif 815 - subs H, H, #1 816 - .if regs_shortage 817 - str H, [sp, #4] /* save updated height to stack */ 818 - .else 819 - mov ORIG_W, W 820 - .endif 821 - blt 9f 822 - cmp W, #(pixblock_size * 2) 823 - blt 8f 824 - /* 825 - * This is the start of the pipelined loop, which is optimized for 826 - * long scanlines 827 - */ 828 - 0: 829 - ensure_destination_ptr_alignment process_pixblock_head, \ 830 - process_pixblock_tail, \ 831 - process_pixblock_tail_head 832 - 833 - /* Implement "head (tail_head) ... 
(tail_head) tail" loop pattern */ 834 - pixld_a pixblock_size, dst_r_bpp, \ 835 - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 836 - fetch_src_pixblock 837 - pixld pixblock_size, mask_bpp, \ 838 - (mask_basereg - pixblock_size * mask_bpp / 64), MASK 839 - PF add PF_X, PF_X, #pixblock_size 840 - process_pixblock_head 841 - cache_preload 0, pixblock_size 842 - cache_preload_simple 843 - subs W, W, #(pixblock_size * 2) 844 - blt 2f 845 - 1: 846 - process_pixblock_tail_head 847 - cache_preload_simple 848 - subs W, W, #pixblock_size 849 - bge 1b 850 - 2: 851 - process_pixblock_tail 852 - pixst_a pixblock_size, dst_w_bpp, \ 853 - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 854 - 855 - /* Process the remaining trailing pixels in the scanline */ 856 - process_trailing_pixels 1, 1, \ 857 - process_pixblock_head, \ 858 - process_pixblock_tail, \ 859 - process_pixblock_tail_head 860 - advance_to_next_scanline 0b 861 - 862 - .if regs_shortage 863 - pop {r0, r1} 864 - .endif 865 - cleanup 866 - pop {r4-r12, pc} /* exit */ 867 - /* 868 - * This is the start of the loop, designed to process images with small width 869 - * (less than pixblock_size * 2 pixels). In this case neither pipelining 870 - * nor prefetch are used. 
871 - */ 872 - 8: 873 - /* Process exactly pixblock_size pixels if needed */ 874 - tst W, #pixblock_size 875 - beq 1f 876 - pixld pixblock_size, dst_r_bpp, \ 877 - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 878 - fetch_src_pixblock 879 - pixld pixblock_size, mask_bpp, \ 880 - (mask_basereg - pixblock_size * mask_bpp / 64), MASK 881 - process_pixblock_head 882 - process_pixblock_tail 883 - pixst pixblock_size, dst_w_bpp, \ 884 - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 885 - 1: 886 - /* Process the remaining trailing pixels in the scanline */ 887 - process_trailing_pixels 0, 0, \ 888 - process_pixblock_head, \ 889 - process_pixblock_tail, \ 890 - process_pixblock_tail_head 891 - advance_to_next_scanline 8b 892 - 9: 893 - .if regs_shortage 894 - pop {r0, r1} 895 - .endif 896 - cleanup 897 - pop {r4-r12, pc} /* exit */ 898 - 899 - .purgem fetch_src_pixblock 900 - .purgem pixld_src 901 - 902 - .unreq SRC 903 - .unreq MASK 904 - .unreq DST_R 905 - .unreq DST_W 906 - .unreq ORIG_W 907 - .unreq W 908 - .unreq H 909 - .unreq SRC_STRIDE 910 - .unreq DST_STRIDE 911 - .unreq MASK_STRIDE 912 - .unreq PF_CTL 913 - .unreq PF_X 914 - .unreq PF_SRC 915 - .unreq PF_DST 916 - .unreq PF_MASK 917 - .unreq DUMMY 918 - .endfunc 919 - .endm 920 - 921 - /* 922 - * A simplified variant of function generation template for a single 923 - * scanline processing (for implementing pixman combine functions) 924 - */ 925 - .macro generate_composite_function_scanline use_nearest_scaling, \ 926 - fname, \ 927 - src_bpp_, \ 928 - mask_bpp_, \ 929 - dst_w_bpp_, \ 930 - flags, \ 931 - pixblock_size_, \ 932 - init, \ 933 - cleanup, \ 934 - process_pixblock_head, \ 935 - process_pixblock_tail, \ 936 - process_pixblock_tail_head, \ 937 - dst_w_basereg_ = 28, \ 938 - dst_r_basereg_ = 4, \ 939 - src_basereg_ = 0, \ 940 - mask_basereg_ = 24 941 - 942 - pixman_asm_function fname 943 - 944 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE 945 - /* 946 - * Make some macro arguments 
globally visible and accessible 947 - * from other macros 948 - */ 949 - .set src_bpp, src_bpp_ 950 - .set mask_bpp, mask_bpp_ 951 - .set dst_w_bpp, dst_w_bpp_ 952 - .set pixblock_size, pixblock_size_ 953 - .set dst_w_basereg, dst_w_basereg_ 954 - .set dst_r_basereg, dst_r_basereg_ 955 - .set src_basereg, src_basereg_ 956 - .set mask_basereg, mask_basereg_ 957 - 958 - .if use_nearest_scaling != 0 959 - /* 960 - * Assign symbolic names to registers for nearest scaling 961 - */ 962 - W .req r0 963 - DST_W .req r1 964 - SRC .req r2 965 - VX .req r3 966 - UNIT_X .req ip 967 - MASK .req lr 968 - TMP1 .req r4 969 - TMP2 .req r5 970 - DST_R .req r6 971 - SRC_WIDTH_FIXED .req r7 972 - 973 - .macro pixld_src x:vararg 974 - pixld_s x 975 - .endm 976 - 977 - ldr UNIT_X, [sp] 978 - push {r4-r8, lr} 979 - ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] 980 - .if mask_bpp != 0 981 - ldr MASK, [sp, #(24 + 8)] 982 - .endif 983 - .else 984 - /* 985 - * Assign symbolic names to registers 986 - */ 987 - W .req r0 /* width (is updated during processing) */ 988 - DST_W .req r1 /* destination buffer pointer for writes */ 989 - SRC .req r2 /* source buffer pointer */ 990 - DST_R .req ip /* destination buffer pointer for reads */ 991 - MASK .req r3 /* mask pointer */ 992 - 993 - .macro pixld_src x:vararg 994 - pixld x 995 - .endm 996 - .endif 997 - 998 - .if (((flags) & FLAG_DST_READWRITE) != 0) 999 - .set dst_r_bpp, dst_w_bpp 1000 - .else 1001 - .set dst_r_bpp, 0 1002 - .endif 1003 - .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) 1004 - .set DEINTERLEAVE_32BPP_ENABLED, 1 1005 - .else 1006 - .set DEINTERLEAVE_32BPP_ENABLED, 0 1007 - .endif 1008 - 1009 - .macro fetch_src_pixblock 1010 - pixld_src pixblock_size, src_bpp, \ 1011 - (src_basereg - pixblock_size * src_bpp / 64), SRC 1012 - .endm 1013 - 1014 - init 1015 - mov DST_R, DST_W 1016 - 1017 - cmp W, #pixblock_size 1018 - blt 8f 1019 - 1020 - ensure_destination_ptr_alignment process_pixblock_head, \ 1021 - process_pixblock_tail, \ 1022 - 
process_pixblock_tail_head 1023 - 1024 - subs W, W, #pixblock_size 1025 - blt 7f 1026 - 1027 - /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ 1028 - pixld_a pixblock_size, dst_r_bpp, \ 1029 - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 1030 - fetch_src_pixblock 1031 - pixld pixblock_size, mask_bpp, \ 1032 - (mask_basereg - pixblock_size * mask_bpp / 64), MASK 1033 - process_pixblock_head 1034 - subs W, W, #pixblock_size 1035 - blt 2f 1036 - 1: 1037 - process_pixblock_tail_head 1038 - subs W, W, #pixblock_size 1039 - bge 1b 1040 - 2: 1041 - process_pixblock_tail 1042 - pixst_a pixblock_size, dst_w_bpp, \ 1043 - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 1044 - 7: 1045 - /* Process the remaining trailing pixels in the scanline (dst aligned) */ 1046 - process_trailing_pixels 0, 1, \ 1047 - process_pixblock_head, \ 1048 - process_pixblock_tail, \ 1049 - process_pixblock_tail_head 1050 - 1051 - cleanup 1052 - .if use_nearest_scaling != 0 1053 - pop {r4-r8, pc} /* exit */ 1054 - .else 1055 - bx lr /* exit */ 1056 - .endif 1057 - 8: 1058 - /* Process the remaining trailing pixels in the scanline (dst unaligned) */ 1059 - process_trailing_pixels 0, 0, \ 1060 - process_pixblock_head, \ 1061 - process_pixblock_tail, \ 1062 - process_pixblock_tail_head 1063 - 1064 - cleanup 1065 - 1066 - .if use_nearest_scaling != 0 1067 - pop {r4-r8, pc} /* exit */ 1068 - 1069 - .unreq DST_R 1070 - .unreq SRC 1071 - .unreq W 1072 - .unreq VX 1073 - .unreq UNIT_X 1074 - .unreq TMP1 1075 - .unreq TMP2 1076 - .unreq DST_W 1077 - .unreq MASK 1078 - .unreq SRC_WIDTH_FIXED 1079 - 1080 - .else 1081 - bx lr /* exit */ 1082 - 1083 - .unreq SRC 1084 - .unreq MASK 1085 - .unreq DST_R 1086 - .unreq DST_W 1087 - .unreq W 1088 - .endif 1089 - 1090 - .purgem fetch_src_pixblock 1091 - .purgem pixld_src 1092 - 1093 - .endfunc 1094 - .endm 1095 - 1096 - .macro generate_composite_function_single_scanline x:vararg 1097 - generate_composite_function_scanline 0, x 
1098 - .endm 1099 - 1100 - .macro generate_composite_function_nearest_scanline x:vararg 1101 - generate_composite_function_scanline 1, x 1102 - .endm 1103 - 1104 - /* Default prologue/epilogue, nothing special needs to be done */ 1105 - 1106 - .macro default_init 1107 - .endm 1108 - 1109 - .macro default_cleanup 1110 - .endm 1111 - 1112 - /* 1113 - * Prologue/epilogue variant which additionally saves/restores d8-d15 1114 - * registers (they need to be saved/restored by callee according to ABI). 1115 - * This is required if the code needs to use all the NEON registers. 1116 - */ 1117 - 1118 - .macro default_init_need_all_regs 1119 - vpush {d8-d15} 1120 - .endm 1121 - 1122 - .macro default_cleanup_need_all_regs 1123 - vpop {d8-d15} 1124 - .endm 1125 - 1126 - /******************************************************************************/ 1127 - 1128 - /* 1129 - * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in) 1130 - * into a planar a8r8g8b8 format (with a, r, g, b color components 1131 - * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). 1132 - * 1133 - * Warning: the conversion is destructive and the original 1134 - * value (in) is lost. 1135 - */ 1136 - .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b 1137 - vshrn.u16 out_r, in, #8 1138 - vshrn.u16 out_g, in, #3 1139 - vsli.u16 in, in, #5 1140 - vmov.u8 out_a, #255 1141 - vsri.u8 out_r, out_r, #5 1142 - vsri.u8 out_g, out_g, #6 1143 - vshrn.u16 out_b, in, #2 1144 - .endm 1145 - 1146 - .macro convert_0565_to_x888 in, out_r, out_g, out_b 1147 - vshrn.u16 out_r, in, #8 1148 - vshrn.u16 out_g, in, #3 1149 - vsli.u16 in, in, #5 1150 - vsri.u8 out_r, out_r, #5 1151 - vsri.u8 out_g, out_g, #6 1152 - vshrn.u16 out_b, in, #2 1153 - .endm 1154 - 1155 - /* 1156 - * Conversion from planar a8r8g8b8 format (with a, r, g, b color components 1157 - * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5 1158 - * pixels packed in 128-bit register (out). 
Requires two temporary 128-bit 1159 - * registers (tmp1, tmp2) 1160 - */ 1161 - .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 1162 - vshll.u8 tmp1, in_g, #8 1163 - vshll.u8 out, in_r, #8 1164 - vshll.u8 tmp2, in_b, #8 1165 - vsri.u16 out, tmp1, #5 1166 - vsri.u16 out, tmp2, #11 1167 - .endm 1168 - 1169 - /* 1170 - * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels 1171 - * returned in (out0, out1) registers pair. Requires one temporary 1172 - * 64-bit register (tmp). 'out1' and 'in' may overlap, the original 1173 - * value from 'in' is lost 1174 - */ 1175 - .macro convert_four_0565_to_x888_packed in, out0, out1, tmp 1176 - vshl.u16 out0, in, #5 /* G top 6 bits */ 1177 - vshl.u16 tmp, in, #11 /* B top 5 bits */ 1178 - vsri.u16 in, in, #5 /* R is ready in top bits */ 1179 - vsri.u16 out0, out0, #6 /* G is ready in top bits */ 1180 - vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ 1181 - vshr.u16 out1, in, #8 /* R is in place */ 1182 - vsri.u16 out0, tmp, #8 /* G & B is in place */ 1183 - vzip.u16 out0, out1 /* everything is in place */ 1184 - .endm

-532

src/video/arm/pixman-arm-simd-asm.S

··· 1 - /* 2 - * Copyright (c) 2016 RISC OS Open Ltd 3 - * 4 - * This software is provided 'as-is', without any express or implied 5 - * warranty. In no event will the authors be held liable for any damages 6 - * arising from the use of this software. 7 - * 8 - * Permission is granted to anyone to use this software for any purpose, 9 - * including commercial applications, and to alter it and redistribute it 10 - * freely, subject to the following restrictions: 11 - * 12 - * 1. The origin of this software must not be misrepresented; you must not 13 - * claim that you wrote the original software. If you use this software 14 - * in a product, an acknowledgment in the product documentation would be 15 - * appreciated but is not required. 16 - * 2. Altered source versions must be plainly marked as such, and must not be 17 - * misrepresented as being the original software. 18 - * 3. This notice may not be removed or altered from any source distribution. 19 - */ 20 - 21 - /* Prevent the stack from becoming executable */ 22 - #if defined(__linux__) && defined(__ELF__) 23 - .section .note.GNU-stack,"",%progbits 24 - #endif 25 - 26 - .text 27 - .arch armv6 28 - .object_arch armv4 29 - .arm 30 - .altmacro 31 - .p2align 2 32 - 33 - #include "pixman-arm-asm.h" 34 - #include "pixman-arm-simd-asm.h" 35 - 36 - /* A head macro should do all processing which results in an output of up to 37 - * 16 bytes, as far as the final load instruction. The corresponding tail macro 38 - * should complete the processing of the up-to-16 bytes. The calling macro will 39 - * sometimes choose to insert a preload or a decrement of X between them. 
40 - * cond ARM condition code for code block 41 - * numbytes Number of output bytes that should be generated this time 42 - * firstreg First WK register in which to place output 43 - * unaligned_src Whether to use non-wordaligned loads of source image 44 - * unaligned_mask Whether to use non-wordaligned loads of mask image 45 - * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output 46 - */ 47 - 48 - /******************************************************************************/ 49 - 50 - .macro FillRect32_init 51 - ldr SRC, [sp, #ARGS_STACK_OFFSET] 52 - mov STRIDE_S, SRC 53 - mov MASK, SRC 54 - mov STRIDE_M, SRC 55 - .endm 56 - 57 - .macro FillRect16_init 58 - ldrh SRC, [sp, #ARGS_STACK_OFFSET] 59 - orr SRC, SRC, lsl #16 60 - mov STRIDE_S, SRC 61 - mov MASK, SRC 62 - mov STRIDE_M, SRC 63 - .endm 64 - 65 - .macro FillRect8_init 66 - ldrb SRC, [sp, #ARGS_STACK_OFFSET] 67 - orr SRC, SRC, lsl #8 68 - orr SRC, SRC, lsl #16 69 - mov STRIDE_S, SRC 70 - mov MASK, SRC 71 - mov STRIDE_M, SRC 72 - .endm 73 - 74 - .macro FillRect_process_tail cond, numbytes, firstreg 75 - WK4 .req SRC 76 - WK5 .req STRIDE_S 77 - WK6 .req MASK 78 - WK7 .req STRIDE_M 79 - pixst cond, numbytes, 4, DST 80 - .unreq WK4 81 - .unreq WK5 82 - .unreq WK6 83 - .unreq WK7 84 - .endm 85 - 86 - generate_composite_function \ 87 - FillSurfaceRect32ARMSIMDAsm, 0, 0, 32, \ 88 - FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ 89 - 0, /* prefetch distance doesn't apply */ \ 90 - FillRect32_init \ 91 - nop_macro, /* newline */ \ 92 - nop_macro /* cleanup */ \ 93 - nop_macro /* process head */ \ 94 - FillRect_process_tail 95 - 96 - generate_composite_function \ 97 - FillSurfaceRect16ARMSIMDAsm, 0, 0, 16, \ 98 - FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ 99 - 0, /* prefetch distance doesn't apply */ \ 100 - 
FillRect16_init \ 101 - nop_macro, /* newline */ \ 102 - nop_macro /* cleanup */ \ 103 - nop_macro /* process head */ \ 104 - FillRect_process_tail 105 - 106 - generate_composite_function \ 107 - FillSurfaceRect8ARMSIMDAsm, 0, 0, 8, \ 108 - FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ 109 - 0, /* prefetch distance doesn't apply */ \ 110 - FillRect8_init \ 111 - nop_macro, /* newline */ \ 112 - nop_macro /* cleanup */ \ 113 - nop_macro /* process head */ \ 114 - FillRect_process_tail 115 - 116 - /******************************************************************************/ 117 - 118 - /* This differs from the over_8888_8888 routine in Pixman in that the destination 119 - * alpha component is always left unchanged, and RGB components are not 120 - * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that 121 - * renormalisation is done by multiplying by 257/256 (with rounding) rather than 122 - * simply shifting right by 8 bits - removing the need to special-case alpha=0xff. 
123 - */ 124 - 125 - .macro RGBtoRGBPixelAlpha_init 126 - line_saved_regs STRIDE_S, ORIG_W 127 - mov MASK, #0x80 128 - .endm 129 - 130 - .macro RGBtoRGBPixelAlpha_1pixel_translucent s, d, tmp0, tmp1, tmp2, tmp3, half 131 - uxtb tmp3, s 132 - uxtb tmp0, d 133 - sub tmp0, tmp3, tmp0 134 - uxtb tmp3, s, ror #16 135 - uxtb tmp1, d, ror #16 136 - sub tmp1, tmp3, tmp1 137 - uxtb tmp3, s, ror #8 138 - mov s, s, lsr #24 139 - uxtb tmp2, d, ror #8 140 - sub tmp2, tmp3, tmp2 141 - smlabb tmp0, tmp0, s, half 142 - smlabb tmp1, tmp1, s, half 143 - smlabb tmp2, tmp2, s, half 144 - add tmp0, tmp0, asr #8 145 - add tmp1, tmp1, asr #8 146 - add tmp2, tmp2, asr #8 147 - pkhbt tmp0, tmp0, tmp1, lsl #16 148 - and tmp2, tmp2, #0xff00 149 - uxtb16 tmp0, tmp0, ror #8 150 - orr tmp0, tmp0, tmp2 151 - uadd8 d, d, tmp0 152 - .endm 153 - 154 - .macro RGBtoRGBPixelAlpha_1pixel_opaque s, d 155 - and d, d, #0xff000000 156 - bic s, s, #0xff000000 157 - orr d, d, s 158 - .endm 159 - 160 - .macro RGBtoRGBPixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload 161 - .if numbytes == 16 162 - ldm SRC!, {WK0, WK1} 163 - ldm SRC!, {STRIDE_S, STRIDE_M} 164 - ldrd WK2, WK3, [DST], #16 165 - orr SCRATCH, WK0, WK1 166 - and ORIG_W, WK0, WK1 167 - orr SCRATCH, SCRATCH, STRIDE_S 168 - and ORIG_W, ORIG_W, STRIDE_S 169 - orr SCRATCH, SCRATCH, STRIDE_M 170 - and ORIG_W, ORIG_W, STRIDE_M 171 - tst SCRATCH, #0xff000000 172 - .elseif numbytes == 8 173 - ldm SRC!, {WK0, WK1} 174 - ldm DST!, {WK2, WK3} 175 - orr SCRATCH, WK0, WK1 176 - and ORIG_W, WK0, WK1 177 - tst SCRATCH, #0xff000000 178 - .else // numbytes == 4 179 - ldr WK0, [SRC], #4 180 - ldr WK2, [DST], #4 181 - tst WK0, #0xff000000 182 - .endif 183 - .endm 184 - 185 - .macro RGBtoRGBPixelAlpha_process_tail cond, numbytes, firstreg 186 - beq 20f @ all transparent 187 - .if numbytes == 16 188 - cmp ORIG_W, #0xff000000 189 - bhs 10f @ all opaque 190 - RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, 
ORIG_W, MASK 191 - RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK 192 - strd WK2, WK3, [DST, #-16] 193 - ldrd WK0, WK1, [SRC, #-8] 194 - ldrd WK2, WK3, [DST, #-8] 195 - RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK 196 - RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK 197 - b 19f 198 - 10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 199 - RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 200 - strd WK2, WK3, [DST, #-16] 201 - ldrd WK0, WK1, [SRC, #-8] 202 - ldrd WK2, WK3, [DST, #-8] 203 - RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 204 - RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 205 - 19: strd WK2, WK3, [DST, #-8] 206 - .elseif numbytes == 8 207 - cmp ORIG_W, #0xff000000 208 - bhs 10f @ all opaque 209 - RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK 210 - RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK 211 - b 19f 212 - 10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 213 - RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 214 - 19: strd WK2, WK3, [DST, #-8] 215 - .else // numbytes == 4 216 - cmp WK0, #0xff000000 217 - bhs 10f @ opaque 218 - RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK 219 - b 19f 220 - 10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 221 - 19: str WK2, [DST, #-4] 222 - .endif 223 - 20: 224 - .endm 225 - 226 - generate_composite_function \ 227 - BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \ 228 - FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \ 229 - 2, /* prefetch distance */ \ 230 - RGBtoRGBPixelAlpha_init, \ 231 - nop_macro, /* newline */ \ 232 - nop_macro, /* cleanup */ \ 233 - RGBtoRGBPixelAlpha_process_head, \ 234 - RGBtoRGBPixelAlpha_process_tail 235 - 236 - 
/******************************************************************************/ 237 - 238 - .macro ARGBto565PixelAlpha_init 239 - line_saved_regs STRIDE_D, STRIDE_S, ORIG_W 240 - mov MASK, #0x001f 241 - mov STRIDE_M, #0x0010 242 - orr MASK, MASK, MASK, lsl #16 243 - orr STRIDE_M, STRIDE_M, STRIDE_M, lsl #16 244 - .endm 245 - 246 - .macro ARGBto565PixelAlpha_newline 247 - mov STRIDE_S, #0x0200 248 - .endm 249 - 250 - /* On entry: 251 - * s holds 1 32bpp source pixel 252 - * d holds 1 16bpp destination pixel 253 - * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively 254 - * other registers are temporaries 255 - * On exit: 256 - * Constant registers preserved 257 - */ 258 - 259 - .macro ARGBto565PixelAlpha_1pixel_translucent s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc 260 - mov alpha, s, lsr #27 261 - and misc, s, #0xfc00 262 - and g, d, #0x07e0 263 - pkhbt rb, d, d, lsl #5 264 - rsb misc, g, misc, lsr #5 265 - and s, rbmask, s, lsr #3 266 - and rb, rbmask, rb 267 - sub s, s, rb 268 - smlabb misc, misc, alpha, ghalf 269 - mla s, s, alpha, rbhalf 270 - add misc, misc, misc, lsl #5 271 - add g, g, misc, asr #10 272 - add s, s, s, lsl #5 273 - and g, g, #0x07e0 274 - add rb, rb, s, asr #10 275 - and rb, rb, rbmask 276 - pkhbt rb, rb, rb, lsl #11 277 - orr d, rb, g 278 - orr d, d, rb, lsr #16 279 - .endm 280 - 281 - /* On entry: 282 - * s holds 1 32bpp source pixel 283 - * d holds 1 16bpp destination pixel 284 - * rbmask holds 0x001f001f 285 - * On exit: 286 - * Constant registers preserved 287 - */ 288 - 289 - .macro ARGBto565PixelAlpha_1pixel_opaque s, d, rbmask 290 - and d, rbmask, s, lsr #3 291 - and s, s, #0xfc00 292 - orr d, d, d, lsr #5 293 - orr d, d, s, lsr #5 294 - .endm 295 - 296 - /* On entry: 297 - * s1, s2 hold 2 32bpp source pixels 298 - * d holds 2 16bpp destination pixels 299 - * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively 300 - * other registers are temporaries 301 - * On exit: 302 - *
Constant registers preserved 303 - * Blended results have been written through destination pointer 304 - */ 305 - 306 - .macro ARGBto565PixelAlpha_2pixels_translucent s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc 307 - mov alpha, s1, lsr #27 308 - and misc, s1, #0xfc00 309 - and g, d, #0x07e0 310 - pkhbt rb, d, d, lsl #5 311 - rsb misc, g, misc, lsr #5 312 - and s1, rbmask, s1, lsr #3 313 - and rb, rbmask, rb 314 - sub s1, s1, rb 315 - smlabb misc, misc, alpha, ghalf 316 - mla s1, s1, alpha, rbhalf 317 - uxth d, d, ror #16 318 - add misc, misc, misc, lsl #5 319 - mov alpha, s2, lsr #27 320 - add g, g, misc, asr #10 321 - add s1, s1, s1, lsl #5 322 - and g, g, #0x07e0 323 - add rb, rb, s1, asr #10 324 - and rb, rb, rbmask 325 - and misc, s2, #0xfc00 326 - pkhbt rb, rb, rb, lsl #11 327 - and s1, d, #0x07e0 328 - pkhbt d, d, d, lsl #5 329 - rsb misc, s1, misc, lsr #5 330 - and s2, rbmask, s2, lsr #3 331 - and d, rbmask, d 332 - sub s2, s2, d 333 - smlabb misc, misc, alpha, ghalf 334 - mla s2, s2, alpha, rbhalf 335 - orr alpha, rb, g 336 - add misc, misc, misc, lsl #5 337 - orr alpha, alpha, rb, lsr #16 338 - add s1, s1, misc, asr #10 339 - add s2, s2, s2, lsl #5 340 - and s1, s1, #0x07e0 341 - add d, d, s2, asr #10 342 - and d, d, rbmask 343 - strh alpha, [DST, #-4] 344 - pkhbt d, d, d, lsl #11 345 - orr alpha, d, s1 346 - orr alpha, alpha, d, lsr #16 347 - strh alpha, [DST, #-2] 348 - .endm 349 - 350 - /* On entry: 351 - * s1, s2 hold 2 32bpp source pixels 352 - * rbmask holds 0x001f001f 353 - * other registers are temporaries 354 - * On exit: 355 - * Constant registers preserved 356 - * Blended results have been written through destination pointer 357 - */ 358 - 359 - .macro ARGBto565PixelAlpha_2pixels_opaque s1, s2, d, rbmask, g 360 - and g, s1, #0xfc00 361 - and d, rbmask, s1, lsr #3 362 - and s1, rbmask, s2, lsr #3 363 - orr d, d, d, lsr #5 364 - orr d, d, g, lsr #5 365 - and g, s2, #0xfc00 366 - strh d, [DST, #-4] 367 - orr s1, s1, s1, lsr #5 368 - orr 
s1, s1, g, lsr #5 369 - strh s1, [DST, #-2] 370 - .endm 371 - 372 - .macro ARGBto565PixelAlpha_2pixels_head 373 - ldrd WK0, WK1, [SRC], #8 374 - ldr WK2, [DST], #4 375 - orr SCRATCH, WK0, WK1 376 - and ORIG_W, WK0, WK1 377 - tst SCRATCH, #0xff000000 378 - .endm 379 - 380 - .macro ARGBto565PixelAlpha_2pixels_tail 381 - beq 20f @ all transparent 382 - cmp ORIG_W, #0xff000000 383 - bhs 10f @ all opaque 384 - ARGBto565PixelAlpha_2pixels_translucent WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W 385 - b 20f 386 - 10: ARGBto565PixelAlpha_2pixels_opaque WK0, WK1, WK2, MASK, SCRATCH 387 - 20: 388 - .endm 389 - 390 - .macro ARGBto565PixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload 391 - .if numbytes == 16 392 - ARGBto565PixelAlpha_2pixels_head 393 - ARGBto565PixelAlpha_2pixels_tail 394 - ARGBto565PixelAlpha_2pixels_head 395 - ARGBto565PixelAlpha_2pixels_tail 396 - .endif 397 - .if numbytes >= 8 398 - ARGBto565PixelAlpha_2pixels_head 399 - ARGBto565PixelAlpha_2pixels_tail 400 - .endif 401 - .if numbytes >= 4 402 - ARGBto565PixelAlpha_2pixels_head 403 - .else // numbytes == 2 404 - ldr WK0, [SRC], #4 405 - ldrh WK2, [DST], #2 406 - tst WK0, #0xff000000 407 - .endif 408 - .endm 409 - 410 - .macro ARGBto565PixelAlpha_process_tail cond, numbytes, firstreg 411 - .if numbytes >= 4 412 - ARGBto565PixelAlpha_2pixels_tail 413 - .else // numbytes == 2 414 - beq 20f @ all transparent 415 - cmp WK0, #0xff000000 416 - bhs 10f @ opaque 417 - ARGBto565PixelAlpha_1pixel_translucent WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W 418 - b 19f 419 - 10: ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK 420 - 19: strh WK2, [DST, #-2] 421 - 20: 422 - .endif 423 - .endm 424 - 425 - generate_composite_function \ 426 - BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \ 427 - FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \ 
428 - 2, /* prefetch distance */ \ 429 - ARGBto565PixelAlpha_init, \ 430 - ARGBto565PixelAlpha_newline, \ 431 - nop_macro, /* cleanup */ \ 432 - ARGBto565PixelAlpha_process_head, \ 433 - ARGBto565PixelAlpha_process_tail 434 - 435 - /******************************************************************************/ 436 - 437 - .macro BGR888toRGB888_1pixel cond, reg, tmp 438 - uxtb16&cond tmp, WK&reg, ror #8 439 - uxtb16&cond WK&reg, WK&reg, ror #16 440 - orr&cond WK&reg, WK&reg, tmp, lsl #8 441 - .endm 442 - 443 - .macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2 444 - uxtb16&cond tmp1, WK&reg1, ror #8 445 - uxtb16&cond WK&reg1, WK&reg1, ror #16 446 - uxtb16&cond tmp2, WK&reg2, ror #8 447 - uxtb16&cond WK&reg2, WK&reg2, ror #16 448 - orr&cond WK&reg1, WK&reg1, tmp1, lsl #8 449 - orr&cond WK&reg2, WK&reg2, tmp2, lsl #8 450 - .endm 451 - 452 - .macro BGR888toRGB888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload 453 - pixld cond, numbytes, firstreg, SRC, unaligned_src 454 - .endm 455 - 456 - .macro BGR888toRGB888_process_tail cond, numbytes, firstreg 457 - .if numbytes >= 8 458 - BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M 459 - .if numbytes == 16 460 - BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M 461 - .endif 462 - .else @ numbytes == 4 463 - BGR888toRGB888_1pixel cond, %(firstreg+0), MASK 464 - .endif 465 - .endm 466 - 467 - generate_composite_function \ 468 - Blit_XBGR8888_XRGB8888ARMSIMDAsm, 32, 0, 32, \ 469 - FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ 470 - 2, /* prefetch distance */ \ 471 - nop_macro, /* init */ \ 472 - nop_macro, /* newline */ \ 473 - nop_macro, /* cleanup */ \ 474 - BGR888toRGB888_process_head, \ 475 - BGR888toRGB888_process_tail 476 - 477 - /******************************************************************************/ 478 - 479 - .macro RGB444toRGB888_init 480 - ldr MASK, =0x0f0f0f0f 481 - /* Set GE[3:0] to 0101 so SEL 
instructions do what we want */ 482 - msr CPSR_s, #0x50000 483 - .endm 484 - 485 - .macro RGB444toRGB888_1pixel reg, mask, tmp 486 - pkhbt WK&reg, WK&reg, WK&reg, lsl #12 @ 0000aaaarrrrggggaaaarrrrggggbbbb 487 - and WK&reg, mask, WK&reg @ 0000aaaa0000gggg0000rrrr0000bbbb 488 - orr WK&reg, WK&reg, WK&reg, lsl #4 @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb 489 - pkhtb tmp, WK&reg, WK&reg, asr #8 @ aaaaaaaaggggggggggggggggrrrrrrrr 490 - pkhbt WK&reg, WK&reg, WK&reg, lsl #8 @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb 491 - sel WK&reg, WK&reg, tmp @ aaaaaaaarrrrrrrrggggggggbbbbbbbb 492 - .endm 493 - 494 - .macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2 495 - and tmp1, mask, WK&in @ 0000RRRR0000BBBB0000rrrr0000bbbb 496 - and tmp2, mask, WK&in, lsr #4 @ 0000AAAA0000GGGG0000aaaa0000gggg 497 - orr tmp1, tmp1, tmp1, lsl #4 @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb 498 - orr tmp2, tmp2, tmp2, lsl #4 @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg 499 - pkhtb WK&out2, tmp2, tmp1, asr #16 @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB 500 - pkhbt WK&out1, tmp1, tmp2, lsl #16 @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb 501 - pkhtb tmp2, WK&out2, WK&out2, asr #8 @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR 502 - pkhtb tmp1, WK&out1, WK&out1, asr #8 @ aaaaaaaaggggggggggggggggrrrrrrrr 503 - pkhbt WK&out1, WK&out1, WK&out1, lsl #8 @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb 504 - pkhbt WK&out2, WK&out2, WK&out2, lsl #8 @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB 505 - sel WK&out1, WK&out1, tmp1 @ aaaaaaaarrrrrrrrggggggggbbbbbbbb 506 - sel WK&out2, WK&out2, tmp2 @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB 507 - .endm 508 - 509 - .macro RGB444toRGB888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload 510 - pixld cond, numbytes/2, firstreg, SRC, unaligned_src 511 - .endm 512 - 513 - .macro RGB444toRGB888_process_tail cond, numbytes, firstreg 514 - .if numbytes >= 8 515 - .if numbytes == 16 516 - RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH 517 - .endif 518 - RGB444toRGB888_2pixels 
%(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH 519 - .else @ numbytes == 4 520 - RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH 521 - .endif 522 - .endm 523 - 524 - generate_composite_function \ 525 - Blit_RGB444_XRGB8888ARMSIMDAsm, 16, 0, 32, \ 526 - FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ 527 - 2, /* prefetch distance */ \ 528 - RGB444toRGB888_init, \ 529 - nop_macro, /* newline */ \ 530 - nop_macro, /* cleanup */ \ 531 - RGB444toRGB888_process_head, \ 532 - RGB444toRGB888_process_tail

-1034

src/video/arm/pixman-arm-simd-asm.h

··· 1 - /* 2 - * Copyright (c) 2012 Raspberry Pi Foundation 3 - * Copyright (c) 2012 RISC OS Open Ltd 4 - * 5 - * This software is provided 'as-is', without any express or implied 6 - * warranty. In no event will the authors be held liable for any damages 7 - * arising from the use of this software. 8 - * 9 - * Permission is granted to anyone to use this software for any purpose, 10 - * including commercial applications, and to alter it and redistribute it 11 - * freely, subject to the following restrictions: 12 - * 13 - * 1. The origin of this software must not be misrepresented; you must not 14 - * claim that you wrote the original software. If you use this software 15 - * in a product, an acknowledgment in the product documentation would be 16 - * appreciated but is not required. 17 - * 2. Altered source versions must be plainly marked as such, and must not be 18 - * misrepresented as being the original software. 19 - * 3. This notice may not be removed or altered from any source distribution. 20 - */ 21 - 22 - /* 23 - * Because the alignment of pixel data to cachelines, and even the number of 24 - * cachelines per row can vary from row to row, and because of the need to 25 - * preload each scanline once and only once, this prefetch strategy treats 26 - * each row of pixels independently. 
When a pixel row is long enough, there 27 - * are three distinct phases of prefetch: 28 - * * an inner loop section, where each time a cacheline of data is 29 - * processed, another cacheline is preloaded (the exact distance ahead is 30 - * determined empirically using profiling results from lowlevel-blt-bench) 31 - * * a leading section, where enough cachelines are preloaded to ensure no 32 - * cachelines escape being preloaded when the inner loop starts 33 - * * a trailing section, where a limited number (0 or more) of cachelines 34 - * are preloaded to deal with data (if any) that hangs off the end of the 35 - * last iteration of the inner loop, plus any trailing bytes that were not 36 - * enough to make up one whole iteration of the inner loop 37 - * 38 - * There are (in general) three distinct code paths, selected between 39 - * depending upon how long the pixel row is. If it is long enough that there 40 - * is at least one iteration of the inner loop (as described above) then 41 - * this is described as the "wide" case. If it is shorter than that, but 42 - * there are still enough bytes output that there is at least one 16-byte- 43 - * long, 16-byte-aligned write to the destination (the optimum type of 44 - * write), then this is the "medium" case. If it is not even this long, then 45 - * this is the "narrow" case, and there is no attempt to align writes to 46 - * 16-byte boundaries. In the "medium" and "narrow" cases, all the 47 - * cachelines containing data from the pixel row are prefetched up-front. 48 - */ 49 - 50 - /* 51 - * Determine whether we put the arguments on the stack for debugging. 52 - */ 53 - #undef DEBUG_PARAMS 54 - 55 - /* 56 - * Bit flags for 'generate_composite_function' macro which are used 57 - * to tune generated functions behavior. 
58 - */ 59 - .set FLAG_DST_WRITEONLY, 0 60 - .set FLAG_DST_READWRITE, 1 61 - .set FLAG_COND_EXEC, 0 62 - .set FLAG_BRANCH_OVER, 2 63 - .set FLAG_PROCESS_PRESERVES_PSR, 0 64 - .set FLAG_PROCESS_CORRUPTS_PSR, 4 65 - .set FLAG_PROCESS_DOESNT_STORE, 0 66 - .set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */ 67 - .set FLAG_NO_SPILL_LINE_VARS, 0 68 - .set FLAG_SPILL_LINE_VARS_WIDE, 16 69 - .set FLAG_SPILL_LINE_VARS_NON_WIDE, 32 70 - .set FLAG_SPILL_LINE_VARS, 48 71 - .set FLAG_PROCESS_CORRUPTS_SCRATCH, 0 72 - .set FLAG_PROCESS_PRESERVES_SCRATCH, 64 73 - .set FLAG_PROCESS_PRESERVES_WK0, 0 74 - .set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */ 75 - .set FLAG_PRELOAD_DST, 0 76 - .set FLAG_NO_PRELOAD_DST, 256 77 - 78 - /* 79 - * Number of bytes by which to adjust preload offset of destination 80 - * buffer (allows preload instruction to be moved before the load(s)) 81 - */ 82 - .set DST_PRELOAD_BIAS, 0 83 - 84 - /* 85 - * Offset into stack where mask and source pointer/stride can be accessed. 86 - */ 87 - #ifdef DEBUG_PARAMS 88 - .set ARGS_STACK_OFFSET, (9*4+9*4) 89 - #else 90 - .set ARGS_STACK_OFFSET, (9*4) 91 - #endif 92 - 93 - /* 94 - * Offset into stack where space allocated during init macro can be accessed. 95 - */ 96 - .set LOCALS_STACK_OFFSET, 0 97 - 98 - /* 99 - * Constants for selecting preferable prefetch type. 100 - */ 101 - .set PREFETCH_TYPE_NONE, 0 102 - .set PREFETCH_TYPE_STANDARD, 1 103 - 104 - /* 105 - * Definitions of macros for load/store of pixel data. 
106 - */ 107 - 108 - .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0 109 - .if numbytes == 16 110 - .if unaligned == 1 111 - op&r&cond WK&reg0, [base], #4 112 - op&r&cond WK&reg1, [base], #4 113 - op&r&cond WK&reg2, [base], #4 114 - op&r&cond WK&reg3, [base], #4 115 - .else 116 - op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3} 117 - .endif 118 - .elseif numbytes == 8 119 - .if unaligned == 1 120 - op&r&cond WK&reg0, [base], #4 121 - op&r&cond WK&reg1, [base], #4 122 - .else 123 - op&m&cond&ia base!, {WK&reg0,WK&reg1} 124 - .endif 125 - .elseif numbytes == 4 126 - op&r&cond WK&reg0, [base], #4 127 - .elseif numbytes == 2 128 - op&r&cond&h WK&reg0, [base], #2 129 - .elseif numbytes == 1 130 - op&r&cond&b WK&reg0, [base], #1 131 - .else 132 - .error "unsupported size: numbytes" 133 - .endif 134 - .endm 135 - 136 - .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base 137 - .if numbytes == 16 138 - stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3} 139 - .elseif numbytes == 8 140 - stm&cond&db base, {WK&reg0,WK&reg1} 141 - .elseif numbytes == 4 142 - str&cond WK&reg0, [base, #-4] 143 - .elseif numbytes == 2 144 - str&cond&h WK&reg0, [base, #-2] 145 - .elseif numbytes == 1 146 - str&cond&b WK&reg0, [base, #-1] 147 - .else 148 - .error "unsupported size: numbytes" 149 - .endif 150 - .endm 151 - 152 - .macro pixld cond, numbytes, firstreg, base, unaligned 153 - pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned 154 - .endm 155 - 156 - .macro pixst cond, numbytes, firstreg, base 157 - .if (flags) & FLAG_DST_READWRITE 158 - pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base 159 - .else 160 - pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base 161 - .endif 162 - .endm 163 - 164 - .macro PF a, x:vararg 165 - .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) 166 - a x 167 - 
.endif 168 - .endm 169 - 170 - 171 - .macro preload_leading_step1 bpp, ptr, base 172 - /* If the destination is already 16-byte aligned, then we need to preload 173 - * between 0 and prefetch_distance (inclusive) cache lines ahead so there 174 - * are no gaps when the inner loop starts. 175 - */ 176 - .if bpp > 0 177 - PF bic, ptr, base, #31 178 - .set OFFSET, 0 179 - .rept prefetch_distance+1 180 - PF pld, [ptr, #OFFSET] 181 - .set OFFSET, OFFSET+32 182 - .endr 183 - .endif 184 - .endm 185 - 186 - .macro preload_leading_step2 bpp, bpp_shift, ptr, base 187 - /* However, if the destination is not 16-byte aligned, we may need to 188 - * preload more cache lines than that. The question we need to ask is: 189 - * are the bytes corresponding to the leading pixels more than the amount 190 - * by which the source pointer will be rounded down for preloading, and if 191 - * so, by how many cache lines? Effectively, we want to calculate 192 - * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp 193 - * inner_loop_offset = (src+leading_bytes)&31 194 - * extra_needed = leading_bytes - inner_loop_offset 195 - * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only 196 - * possible when there are 4 src bytes for every 1 dst byte). 
197 - */ 198 - .if bpp > 0 199 - .ifc base,DST 200 - /* The test can be simplified further when preloading the destination */ 201 - PF tst, base, #16 202 - PF beq, 61f 203 - .else 204 - .if bpp/dst_w_bpp == 4 205 - PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift 206 - PF and, SCRATCH, SCRATCH, #31 207 - PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift 208 - PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ 209 - PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */ 210 - PF bcs, 61f 211 - PF bpl, 60f 212 - PF pld, [ptr, #32*(prefetch_distance+2)] 213 - .else 214 - PF mov, SCRATCH, base, lsl #32-5 215 - PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift 216 - PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift 217 - PF bls, 61f 218 - .endif 219 - .endif 220 - 60: PF pld, [ptr, #32*(prefetch_distance+1)] 221 - 61: 222 - .endif 223 - .endm 224 - 225 - #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) 226 - .macro preload_middle bpp, base, scratch_holds_offset 227 - .if bpp > 0 228 - /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ 229 - .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) 230 - .if scratch_holds_offset 231 - PF pld, [base, SCRATCH] 232 - .else 233 - PF bic, SCRATCH, base, #31 234 - PF pld, [SCRATCH, #32*prefetch_distance] 235 - .endif 236 - .endif 237 - .endif 238 - .endm 239 - 240 - .macro preload_trailing bpp, bpp_shift, base 241 - .if bpp > 0 242 - .if bpp*pix_per_block > 256 243 - /* Calculations are more complex if more than one fetch per block */ 244 - PF and, WK1, base, #31 245 - PF add, WK1, WK1, WK0, lsl #bpp_shift 246 - PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) 247 - PF bic, SCRATCH, base, #31 248 - 80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] 249 - PF add, SCRATCH, SCRATCH, #32 250 - PF subs, WK1, WK1, #32 251 - PF bhi, 80b 252 - .else 253 - /* If exactly one fetch 
per block, then we need either 0, 1 or 2 extra preloads */ 254 - PF mov, SCRATCH, base, lsl #32-5 255 - PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift 256 - PF adceqs, SCRATCH, SCRATCH, #0 257 - /* The instruction above has two effects: ensures Z is only 258 - * set if C was clear (so Z indicates that both shifted quantities 259 - * were 0), and clears C if Z was set (so C indicates that the sum 260 - * of the shifted quantities was greater and not equal to 32) */ 261 - PF beq, 82f 262 - PF bic, SCRATCH, base, #31 263 - PF bcc, 81f 264 - PF pld, [SCRATCH, #32*(prefetch_distance+2)] 265 - 81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] 266 - 82: 267 - .endif 268 - .endif 269 - .endm 270 - 271 - 272 - .macro preload_line narrow_case, bpp, bpp_shift, base 273 - /* "narrow_case" - just means that the macro was invoked from the "narrow" 274 - * code path rather than the "medium" one - because in the narrow case, 275 - * the row of pixels is known to output no more than 30 bytes, then 276 - * (assuming the source pixels are no wider than the destination 277 - * pixels) they cannot possibly straddle more than 2 32-byte cachelines, 278 - * meaning there's no need for a loop.
279 - * "bpp" - number of bits per pixel in the channel (source, mask or 280 - * destination) that's being preloaded, or 0 if this channel is not used 281 - * for reading 282 - * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) 283 - * "base" - base address register of channel to preload (SRC, MASK or DST) 284 - */ 285 - .if bpp > 0 286 - .if narrow_case && (bpp <= dst_w_bpp) 287 - /* In these cases, each line for each channel is in either 1 or 2 cache lines */ 288 - PF bic, WK0, base, #31 289 - PF pld, [WK0] 290 - PF add, WK1, base, X, LSL #bpp_shift 291 - PF sub, WK1, WK1, #1 292 - PF bic, WK1, WK1, #31 293 - PF cmp, WK1, WK0 294 - PF beq, 90f 295 - PF pld, [WK1] 296 - 90: 297 - .else 298 - PF bic, WK0, base, #31 299 - PF pld, [WK0] 300 - PF add, WK1, base, X, lsl #bpp_shift 301 - PF sub, WK1, WK1, #1 302 - PF bic, WK1, WK1, #31 303 - PF cmp, WK1, WK0 304 - PF beq, 92f 305 - 91: PF add, WK0, WK0, #32 306 - PF cmp, WK0, WK1 307 - PF pld, [WK0] 308 - PF bne, 91b 309 - 92: 310 - .endif 311 - .endif 312 - .endm 313 - 314 - 315 - .macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx 316 - process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 317 - .if decrementx 318 - sub&cond X, X, #8*numbytes/dst_w_bpp 319 - .endif 320 - process_tail cond, numbytes, firstreg 321 - .if !((flags) & FLAG_PROCESS_DOES_STORE) 322 - pixst cond, numbytes, firstreg, DST 323 - .endif 324 - .endm 325 - 326 - .macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx 327 - .if (flags) & FLAG_BRANCH_OVER 328 - .ifc cond,mi 329 - bpl 100f 330 - .endif 331 - .ifc cond,cs 332 - bcc 100f 333 - .endif 334 - .ifc cond,ne 335 - beq 100f 336 - .endif 337 - conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx 338 - 100: 339 - .else 340 - conditional_process1_helper 
cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx 341 - .endif 342 - .endm 343 - 344 - .macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx 345 - .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) 346 - /* Can't interleave reads and writes */ 347 - test 348 - conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx 349 - .if (flags) & FLAG_PROCESS_CORRUPTS_PSR 350 - test 351 - .endif 352 - conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx 353 - .else 354 - /* Can interleave reads and writes for better scheduling */ 355 - test 356 - process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 357 - process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 358 - .if decrementx 359 - sub&cond1 X, X, #8*numbytes1/dst_w_bpp 360 - sub&cond2 X, X, #8*numbytes2/dst_w_bpp 361 - .endif 362 - process_tail cond1, numbytes1, firstreg1 363 - process_tail cond2, numbytes2, firstreg2 364 - pixst cond1, numbytes1, firstreg1, DST 365 - pixst cond2, numbytes2, firstreg2, DST 366 - .endif 367 - .endm 368 - 369 - 370 - .macro test_bits_1_0_ptr 371 - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 372 - movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */ 373 - .else 374 - movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */ 375 - .endif 376 - .endm 377 - 378 - .macro test_bits_3_2_ptr 379 - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 380 - movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */ 381 - .else 382 - movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */ 383 - .endif 384 - .endm 385 - 386 - .macro leading_15bytes process_head, process_tail 387 - /* On entry, WK0 bits 0-3 = number of bytes until destination 
is 16-byte aligned */ 388 - .set DECREMENT_X, 1 389 - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 390 - .set DECREMENT_X, 0 391 - sub X, X, WK0, lsr #dst_bpp_shift 392 - str X, [sp, #LINE_SAVED_REG_COUNT*4] 393 - mov X, WK0 394 - .endif 395 - /* Use unaligned loads in all cases for simplicity */ 396 - .if dst_w_bpp == 8 397 - conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X 398 - .elseif dst_w_bpp == 16 399 - test_bits_1_0_ptr 400 - conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X 401 - .endif 402 - conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X 403 - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 404 - ldr X, [sp, #LINE_SAVED_REG_COUNT*4] 405 - .endif 406 - .endm 407 - 408 - .macro test_bits_3_2_pix 409 - movs SCRATCH, X, lsl #dst_bpp_shift+32-3 410 - .endm 411 - 412 - .macro test_bits_1_0_pix 413 - .if dst_w_bpp == 8 414 - movs SCRATCH, X, lsl #dst_bpp_shift+32-1 415 - .else 416 - movs SCRATCH, X, lsr #1 417 - .endif 418 - .endm 419 - 420 - .macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask 421 - conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 422 - .if dst_w_bpp == 16 423 - test_bits_1_0_pix 424 - conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 425 - .elseif dst_w_bpp == 8 426 - conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 427 - .endif 428 - .endm 429 - 430 - 431 - .macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment 432 - 110: 433 - .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ 434 - .rept pix_per_block*dst_w_bpp/128 435 - process_head , 16, 0, unaligned_src, unaligned_mask, 1 436 - .if (src_bpp > 0) && (mask_bpp == 0) && 
((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 437 - preload_middle src_bpp, SRC, 1 438 - .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 439 - preload_middle mask_bpp, MASK, 1 440 - .else 441 - preload_middle src_bpp, SRC, 0 442 - preload_middle mask_bpp, MASK, 0 443 - .endif 444 - .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) 445 - /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that 446 - * destination prefetches are 32-byte aligned. It's also the easiest channel to offset 447 - * preloads for, to achieve staggered prefetches for multiple channels, because there are 448 - * always two STMs per prefetch, so there is always an opposite STM on which to put the 449 - * preload. Note, no need to BIC the base register here */ 450 - PF pld, [DST, #32*prefetch_distance - dst_alignment] 451 - .endif 452 - process_tail , 16, 0 453 - .if !((flags) & FLAG_PROCESS_DOES_STORE) 454 - pixst , 16, 0, DST 455 - .endif 456 - .set SUBBLOCK, SUBBLOCK+1 457 - .endr 458 - subs X, X, #pix_per_block 459 - bhs 110b 460 - .endm 461 - 462 - .macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask 463 - /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ 464 - .if dst_r_bpp > 0 465 - tst DST, #16 466 - bne 111f 467 - process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS 468 - b 112f 469 - 111: 470 - .endif 471 - process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS 472 - 112: 473 - /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ 474 - .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) 475 - PF and, WK0, X, #pix_per_block-1 476 - .endif 477 - preload_trailing src_bpp, 
src_bpp_shift, SRC 478 - preload_trailing mask_bpp, mask_bpp_shift, MASK 479 - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 480 - preload_trailing dst_r_bpp, dst_bpp_shift, DST 481 - .endif 482 - add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp 483 - /* The remainder of the line is handled identically to the medium case */ 484 - medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask 485 - .endm 486 - 487 - .macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask 488 - 120: 489 - process_head , 16, 0, unaligned_src, unaligned_mask, 0 490 - process_tail , 16, 0 491 - .if !((flags) & FLAG_PROCESS_DOES_STORE) 492 - pixst , 16, 0, DST 493 - .endif 494 - subs X, X, #128/dst_w_bpp 495 - bhs 120b 496 - /* Trailing pixels */ 497 - tst X, #128/dst_w_bpp - 1 498 - beq exit_label 499 - trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask 500 - .endm 501 - 502 - .macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask 503 - tst X, #16*8/dst_w_bpp 504 - conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 505 - /* Trailing pixels */ 506 - /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ 507 - trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask 508 - .endm 509 - 510 - .macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label 511 - /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ 512 - .if mask_bpp == 8 || mask_bpp == 16 513 - tst MASK, #3 514 - bne 141f 515 - .endif 516 - .if src_bpp == 8 || src_bpp == 16 517 - tst SRC, #3 518 - bne 140f 519 - .endif 520 - action process_head, process_tail, process_inner_loop, exit_label, 0, 0 521 - .if src_bpp == 8 || 
src_bpp == 16 522 - b exit_label 523 - 140: 524 - action process_head, process_tail, process_inner_loop, exit_label, 1, 0 525 - .endif 526 - .if mask_bpp == 8 || mask_bpp == 16 527 - b exit_label 528 - 141: 529 - .if src_bpp == 8 || src_bpp == 16 530 - tst SRC, #3 531 - bne 142f 532 - .endif 533 - action process_head, process_tail, process_inner_loop, exit_label, 0, 1 534 - .if src_bpp == 8 || src_bpp == 16 535 - b exit_label 536 - 142: 537 - action process_head, process_tail, process_inner_loop, exit_label, 1, 1 538 - .endif 539 - .endif 540 - .endm 541 - 542 - 543 - .macro end_of_line restore_x, vars_spilled, loop_label, last_one 544 - .if SINGLE_SCANLINE 545 - .ifc "last_one","" 546 - b 198f 547 - .endif 548 - .else 549 - .if vars_spilled 550 - /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */ 551 - /* This is ldmia sp,{} */ 552 - .word 0xE89D0000 | LINE_SAVED_REGS 553 - .endif 554 - subs Y, Y, #1 555 - .if vars_spilled 556 - .if (LINE_SAVED_REGS) & (1<<1) 557 - str Y, [sp] 558 - .endif 559 - .endif 560 - add DST, DST, STRIDE_D 561 - .if src_bpp > 0 562 - add SRC, SRC, STRIDE_S 563 - .endif 564 - .if mask_bpp > 0 565 - add MASK, MASK, STRIDE_M 566 - .endif 567 - .if restore_x 568 - mov X, ORIG_W 569 - .endif 570 - bhs loop_label 571 - .ifc "last_one","" 572 - .if vars_spilled 573 - b 197f 574 - .else 575 - b 198f 576 - .endif 577 - .else 578 - .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) 579 - b 198f 580 - .endif 581 - .endif 582 - .endif 583 - .endm 584 - 585 - 586 - .macro generate_composite_function_common fname, \ 587 - src_bpp_, \ 588 - mask_bpp_, \ 589 - dst_w_bpp_, \ 590 - flags_, \ 591 - prefetch_distance_, \ 592 - init, \ 593 - newline, \ 594 - cleanup, \ 595 - process_head, \ 596 - process_tail, \ 597 - process_inner_loop 598 - 599 - pixman_asm_function fname 600 - 601 - /* 602 - * Make some macro arguments globally visible and accessible 603 - * from other macros 604 - */ 605 - .set src_bpp, src_bpp_ 606 - .set 
mask_bpp, mask_bpp_ 607 - .set dst_w_bpp, dst_w_bpp_ 608 - .set flags, flags_ 609 - .set prefetch_distance, prefetch_distance_ 610 - 611 - /* 612 - * Select prefetch type for this function. 613 - */ 614 - .if prefetch_distance == 0 615 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE 616 - .else 617 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD 618 - .endif 619 - 620 - .if src_bpp == 32 621 - .set src_bpp_shift, 2 622 - .elseif src_bpp == 24 623 - .set src_bpp_shift, 0 624 - .elseif src_bpp == 16 625 - .set src_bpp_shift, 1 626 - .elseif src_bpp == 8 627 - .set src_bpp_shift, 0 628 - .elseif src_bpp == 0 629 - .set src_bpp_shift, -1 630 - .else 631 - .error "requested src bpp (src_bpp) is not supported" 632 - .endif 633 - 634 - .if mask_bpp == 32 635 - .set mask_bpp_shift, 2 636 - .elseif mask_bpp == 24 637 - .set mask_bpp_shift, 0 638 - .elseif mask_bpp == 8 639 - .set mask_bpp_shift, 0 640 - .elseif mask_bpp == 0 641 - .set mask_bpp_shift, -1 642 - .else 643 - .error "requested mask bpp (mask_bpp) is not supported" 644 - .endif 645 - 646 - .if dst_w_bpp == 32 647 - .set dst_bpp_shift, 2 648 - .elseif dst_w_bpp == 24 649 - .set dst_bpp_shift, 0 650 - .elseif dst_w_bpp == 16 651 - .set dst_bpp_shift, 1 652 - .elseif dst_w_bpp == 8 653 - .set dst_bpp_shift, 0 654 - .else 655 - .error "requested dst bpp (dst_w_bpp) is not supported" 656 - .endif 657 - 658 - .if (((flags) & FLAG_DST_READWRITE) != 0) 659 - .set dst_r_bpp, dst_w_bpp 660 - .else 661 - .set dst_r_bpp, 0 662 - .endif 663 - 664 - .set pix_per_block, 16*8/dst_w_bpp 665 - .if src_bpp != 0 666 - .if 32*8/src_bpp > pix_per_block 667 - .set pix_per_block, 32*8/src_bpp 668 - .endif 669 - .endif 670 - .if mask_bpp != 0 671 - .if 32*8/mask_bpp > pix_per_block 672 - .set pix_per_block, 32*8/mask_bpp 673 - .endif 674 - .endif 675 - .if dst_r_bpp != 0 676 - .if 32*8/dst_r_bpp > pix_per_block 677 - .set pix_per_block, 32*8/dst_r_bpp 678 - .endif 679 - .endif 680 - 681 - /* The standard entry conditions set up by 
pixman-arm-common.h are: 682 - * r0 = width (pixels) 683 - * r1 = height (rows) 684 - * r2 = pointer to top-left pixel of destination 685 - * r3 = destination stride (pixels) 686 - * [sp] = source pixel value, or pointer to top-left pixel of source 687 - * [sp,#4] = 0 or source stride (pixels) 688 - * The following arguments are unused for non-mask operations 689 - * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask 690 - * [sp,#12] = 0 or mask stride (pixels) 691 - * 692 - * or in the single-scanline case: 693 - * r0 = width (pixels) 694 - * r1 = pointer to top-left pixel of destination 695 - * r2 = pointer to top-left pixel of source 696 - * The following argument is unused for non-mask operations 697 - * r3 = pointer to top-left pixel of mask 698 - */ 699 - 700 - /* 701 - * Assign symbolic names to registers 702 - */ 703 - X .req r0 /* pixels to go on this line */ 704 - .if SINGLE_SCANLINE 705 - DST .req r1 /* destination pixel pointer */ 706 - SRC .req r2 /* source pixel pointer */ 707 - MASK .req r3 /* mask pixel pointer (if applicable) */ 708 - Y .req r4 /* temporary */ 709 - STRIDE_D .req r5 /* temporary */ 710 - STRIDE_S .req r6 /* temporary */ 711 - STRIDE_M .req r7 /* temporary */ 712 - .else 713 - Y .req r1 /* lines to go */ 714 - DST .req r2 /* destination pixel pointer */ 715 - STRIDE_D .req r3 /* destination stride (bytes, minus width) */ 716 - SRC .req r4 /* source pixel pointer */ 717 - STRIDE_S .req r5 /* source stride (bytes, minus width) */ 718 - MASK .req r6 /* mask pixel pointer (if applicable) */ 719 - STRIDE_M .req r7 /* mask stride (bytes, minus width) */ 720 - .endif 721 - WK0 .req r8 /* pixel data registers */ 722 - WK1 .req r9 723 - WK2 .req r10 724 - WK3 .req r11 725 - SCRATCH .req r12 726 - ORIG_W .req r14 /* width (pixels) */ 727 - 728 - push {r4-r11, lr} /* save all registers */ 729 - 730 - .if !SINGLE_SCANLINE 731 - subs Y, Y, #1 732 - blo 199f 733 - .endif 734 - 735 - #ifdef DEBUG_PARAMS 736 - sub sp, sp, #9*4 737 - 
#endif 738 - 739 - .if !SINGLE_SCANLINE 740 - .if src_bpp > 0 741 - ldr SRC, [sp, #ARGS_STACK_OFFSET] 742 - ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4] 743 - .endif 744 - .if mask_bpp > 0 745 - ldr MASK, [sp, #ARGS_STACK_OFFSET+8] 746 - ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12] 747 - .endif 748 - .endif 749 - 750 - #ifdef DEBUG_PARAMS 751 - add Y, Y, #1 752 - stmia sp, {r0-r7,pc} 753 - sub Y, Y, #1 754 - #endif 755 - 756 - init 757 - 758 - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 759 - /* Reserve a word in which to store X during leading pixels */ 760 - sub sp, sp, #4 761 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4 762 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 763 - .endif 764 - 765 - .if !SINGLE_SCANLINE 766 - lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */ 767 - sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift 768 - .if src_bpp > 0 769 - lsl STRIDE_S, #src_bpp_shift 770 - sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift 771 - .endif 772 - .if mask_bpp > 0 773 - lsl STRIDE_M, #mask_bpp_shift 774 - sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift 775 - .endif 776 - .endif 777 - 778 - /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */ 779 - cmp X, #2*16*8/dst_w_bpp - 1 780 - blo 170f 781 - .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */ 782 - /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */ 783 - cmp X, #(prefetch_distance+3)*pix_per_block - 1 784 - blo 160f 785 - 786 - /* Wide case */ 787 - /* Adjust X so that the decrement instruction can also test for 788 - * inner loop termination. We want it to stop when there are 789 - * (prefetch_distance+1) complete blocks to go. 
*/ 790 - sub X, X, #(prefetch_distance+2)*pix_per_block 791 - .if !SINGLE_SCANLINE 792 - mov ORIG_W, X 793 - .if (flags) & FLAG_SPILL_LINE_VARS_WIDE 794 - /* This is stmdb sp!,{} */ 795 - .word 0xE92D0000 | LINE_SAVED_REGS 796 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 797 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 798 - .endif 799 - .endif 800 - 151: /* New line */ 801 - newline 802 - preload_leading_step1 src_bpp, WK1, SRC 803 - preload_leading_step1 mask_bpp, WK2, MASK 804 - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 805 - preload_leading_step1 dst_r_bpp, WK3, DST 806 - .endif 807 - 808 - ands WK0, DST, #15 809 - beq 154f 810 - rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ 811 - 812 - preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC 813 - preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK 814 - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 815 - preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST 816 - .endif 817 - 818 - leading_15bytes process_head, process_tail 819 - 820 - 154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ 821 - .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 822 - and SCRATCH, SRC, #31 823 - rsb SCRATCH, SCRATCH, #32*prefetch_distance 824 - .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) 825 - and SCRATCH, MASK, #31 826 - rsb SCRATCH, SCRATCH, #32*prefetch_distance 827 - .endif 828 - .ifc "process_inner_loop","" 829 - switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f 830 - .else 831 - switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f 832 - .endif 833 - 834 - 157: /* Check for another line */ 835 - end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b 836 - 
.if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_WIDE) 837 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 838 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 839 - .endif 840 - .endif 841 - 842 - .ltorg 843 - 844 - 160: /* Medium case */ 845 - .if !SINGLE_SCANLINE 846 - mov ORIG_W, X 847 - .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE 848 - /* This is stmdb sp!,{} */ 849 - .word 0xE92D0000 | LINE_SAVED_REGS 850 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 851 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 852 - .endif 853 - .endif 854 - 161: /* New line */ 855 - newline 856 - preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ 857 - preload_line 0, mask_bpp, mask_bpp_shift, MASK 858 - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 859 - preload_line 0, dst_r_bpp, dst_bpp_shift, DST 860 - .endif 861 - 862 - sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ 863 - ands WK0, DST, #15 864 - beq 164f 865 - rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ 866 - 867 - leading_15bytes process_head, process_tail 868 - 869 - 164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ 870 - switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f 871 - 872 - 167: /* Check for another line */ 873 - end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b 874 - 875 - .ltorg 876 - 877 - 170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ 878 - .if !SINGLE_SCANLINE 879 - .if dst_w_bpp < 32 880 - mov ORIG_W, X 881 - .endif 882 - .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE 883 - /* This is stmdb sp!,{} */ 884 - .word 0xE92D0000 | LINE_SAVED_REGS 885 - .endif 886 - .endif 887 - 171: /* New line */ 888 - newline 889 - preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ 890 - preload_line 1, 
mask_bpp, mask_bpp_shift, MASK 891 - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 892 - preload_line 1, dst_r_bpp, dst_bpp_shift, DST 893 - .endif 894 - 895 - .if dst_w_bpp == 8 896 - tst DST, #3 897 - beq 174f 898 - 172: subs X, X, #1 899 - blo 177f 900 - process_head , 1, 0, 1, 1, 0 901 - process_tail , 1, 0 902 - .if !((flags) & FLAG_PROCESS_DOES_STORE) 903 - pixst , 1, 0, DST 904 - .endif 905 - tst DST, #3 906 - bne 172b 907 - .elseif dst_w_bpp == 16 908 - tst DST, #2 909 - beq 174f 910 - subs X, X, #1 911 - blo 177f 912 - process_head , 2, 0, 1, 1, 0 913 - process_tail , 2, 0 914 - .if !((flags) & FLAG_PROCESS_DOES_STORE) 915 - pixst , 2, 0, DST 916 - .endif 917 - .endif 918 - 919 - 174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ 920 - switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f 921 - 922 - 177: /* Check for another line */ 923 - end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one 924 - .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE) 925 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 926 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 927 - .endif 928 - 929 - 197: 930 - .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS) 931 - add sp, sp, #LINE_SAVED_REG_COUNT*4 932 - .endif 933 - 198: 934 - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 935 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 936 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 937 - add sp, sp, #4 938 - .endif 939 - 940 - cleanup 941 - 942 - #ifdef DEBUG_PARAMS 943 - add sp, sp, #9*4 /* junk the debug copy of arguments */ 944 - #endif 945 - 199: 946 - pop {r4-r11, pc} /* exit */ 947 - 948 - .ltorg 949 - 950 - .unreq X 951 - .unreq Y 952 - .unreq DST 953 - .unreq STRIDE_D 954 - .unreq SRC 955 - .unreq STRIDE_S 956 - .unreq MASK 957 - .unreq STRIDE_M 958 - .unreq WK0 959 - .unreq WK1 960 - .unreq WK2 961 - .unreq 
WK3 962 - .unreq SCRATCH 963 - .unreq ORIG_W 964 - .endfunc 965 - .endm 966 - 967 - .macro generate_composite_function fname, \ 968 - src_bpp_, \ 969 - mask_bpp_, \ 970 - dst_w_bpp_, \ 971 - flags_, \ 972 - prefetch_distance_, \ 973 - init, \ 974 - newline, \ 975 - cleanup, \ 976 - process_head, \ 977 - process_tail, \ 978 - process_inner_loop 979 - .set SINGLE_SCANLINE, 0 980 - generate_composite_function_common \ 981 - fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \ 982 - init, newline, cleanup, process_head, process_tail, process_inner_loop 983 - .endm 984 - 985 - .macro generate_composite_function_single_scanline fname, \ 986 - src_bpp_, \ 987 - mask_bpp_, \ 988 - dst_w_bpp_, \ 989 - flags_, \ 990 - prefetch_distance_, \ 991 - init, \ 992 - newline, \ 993 - cleanup, \ 994 - process_head, \ 995 - process_tail, \ 996 - process_inner_loop 997 - .set SINGLE_SCANLINE, 1 998 - generate_composite_function_common \ 999 - fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \ 1000 - init, newline, cleanup, process_head, process_tail, process_inner_loop 1001 - .endm 1002 - 1003 - .macro line_saved_regs x:vararg 1004 - .set LINE_SAVED_REGS, 0 1005 - .set LINE_SAVED_REG_COUNT, 0 1006 - .irp SAVED_REG,x 1007 - .ifc "SAVED_REG","Y" 1008 - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) 1009 - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 1010 - .endif 1011 - .ifc "SAVED_REG","STRIDE_D" 1012 - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) 1013 - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 1014 - .endif 1015 - .ifc "SAVED_REG","STRIDE_S" 1016 - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5) 1017 - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 1018 - .endif 1019 - .ifc "SAVED_REG","STRIDE_M" 1020 - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7) 1021 - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 1022 - .endif 1023 - .ifc "SAVED_REG","ORIG_W" 1024 - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14) 1025 - .set 
LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 1026 - .endif 1027 - .endr 1028 - .if SINGLE_SCANLINE 1029 - .set LINE_SAVED_REG_COUNT, 0 1030 - .endif 1031 - .endm 1032 - 1033 - .macro nop_macro x:vararg 1034 - .endm

Configure Feed

Configure Feed