Library/PackageCache/com.unity.render-pipelines.core/Runtime/STP/Stp.hlsl at master

tacstudios.tngl.sh / AloneGame
fork
A game about forced loneliness, made by TACStudios
fork
AloneGame / Library / PackageCache / com.unity.render-pipelines.core / Runtime / STP / Stp.hlsl
at master 4558 lines 273 kB view raw
wrap content
TacWithGames Adding more items to the repo 8mo ago
fc86dd48
   1// This is necessary to prevent Unity from deciding that our default config logic is actually an include guard declaration
   2#ifndef STP_UNITY_INCLUDE_GUARD
   3#define STP_UNITY_INCLUDE_GUARD
   4////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
   5////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
   6////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
   7////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
   8////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
   9////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  10//_____________________________________________________________.._______________________________________________________________
  11//==============================================================================================================================
  12//
  13//
  14//                                                SPATIAL TEMPORAL POST [STP] v1.0
  15//
  16//
  17//==============================================================================================================================
  18////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  19////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  20//_____________________________________________________________.._______________________________________________________________
  21//==============================================================================================================================
  22// C/C++/GLSL/HLSL PORTABILITY BASED ON AMD's 'ffx_a.h'.
  23// INCLUDING ASSOCIATED LICENSE BELOW
  24//------------------------------------------------------------------------------------------------------------------------------
  25// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
  26// Permission is hereby granted, free of charge, to any person obtaining a copy
  27// of this software and associated documentation files(the "Software"), to deal
  28// in the Software without restriction, including without limitation the rights
  29// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
  30// copies of the Software, and to permit persons to whom the Software is
  31// furnished to do so, subject to the following conditions :
  32// The above copyright notice and this permission notice shall be included in
  33// all copies or substantial portions of the Software.
  34// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  35// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  36// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
  37// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  38// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  39// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  40// THE SOFTWARE.
  41//==============================================================================================================================
  42////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  43////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  44//_____________________________________________________________.._______________________________________________________________
  45//==============================================================================================================================
  46//                                                           NOTES
  47//------------------------------------------------------------------------------------------------------------------------------
  48// PLATFORM SPECIFIC WORKAROUNDS
  49// =============================
  50// - These all default to not enabled {0}, define to {1} to enable.
  51// - define STP_BUG_ALIAS16 1 .... Define to enable workaround for asuint16()/asfloat16().
// - define STP_BUG_PRX 1 ........ Define to disable approximate transcendentals.
  53// - define STP_BUG_SAT_INF 1 .... Define to workaround platforms with broken 16-bit saturate +/- INF.
  54// - define STP_BUG_SAT 1 ........ Define to workaround compiler incorrectly factoring out inner saturate in 16-bit code.
  55//------------------------------------------------------------------------------------------------------------------------------
  56// CONFIGURATIONS
  57// ==============
  58// - INDEPENDENT OPTIONS
  59//    - define STP_32BIT  {0 := disable, 1 := compile the 32-bit version or implicit precision version}
  60//    - define STP_MEDIUM {0 := disable, 1 := enable the implicit medium precision version for 32-bit}
  61//    - define STP_16BIT  {0 := disable, 1 := compile the explicit 16-bit version}
  62//    -----
  63//    - define STP_GPU  {to include shader code}
  64//    - define STP_GLSL {to include the GLSL version of the code}
  65//    - define STP_HLSL {to include the HLSL version of the code}
  66//    -----
  67//    - define STP_DIL {to include the StpDil<H,F>() entry points}
  68//    - define STP_PAT {to include the StpPat<H,F>() entry points}
  69//    - define STP_SAA {to include the StpSaa<H,F>() entry points}
  70//    - define STP_TAA {to include the StpTaa<H,F>() entry points}
  71//    -----
  72//    - define STP_POSTMAP {running STP, 0 := before, 1 := after, application tonemapping}
  73//------------------------------------------------------------------------------------------------------------------------------
  74// IMPORTANT
  75// =========
  76// - All callbacks should explicitly sample from MIP level 0.
  77//    - Meaning if used in a pixel shader do not allow implicit LOD calculation.
  78// - The algorithm is tuned for pre-tonemap operation, post-tonemap wasn't tested yet.
  79//==============================================================================================================================
  80////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  81////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  82//_____________________________________________________________.._______________________________________________________________
  83//==============================================================================================================================
  84//                                                      EXTERNAL OPTIONS
  85//==============================================================================================================================
  86// Enable {1} or default disable any debug functionality {0}.
  87#ifndef STP_BUG
  88    #define STP_BUG 0
  89#endif
  90//------------------------------------------------------------------------------------------------------------------------------
  91// Define to test a pass-through dummy shader that fetches all resources but does no logic.
  92#ifndef STP_BUG_BW_SOL
  93    #define STP_BUG_BW_SOL 0
  94#endif
  95//------------------------------------------------------------------------------------------------------------------------------
  96// Define to {1} to use the max/min sampling permutation for color values.
  97#ifndef STP_MAX_MIN_10BIT
  98    #define STP_MAX_MIN_10BIT 0
  99#endif
 100//------------------------------------------------------------------------------------------------------------------------------
 101// Define to {1} to use the max/min sampling permutation for UINT32 values.
 102#ifndef STP_MAX_MIN_UINT
 103    #define STP_MAX_MIN_UINT 0
 104#endif
 105//------------------------------------------------------------------------------------------------------------------------------
 106// Define to {1} to use sampling with offsets.
 107#ifndef STP_OFFSETS
 108    #define STP_OFFSETS 0
 109#endif
 110//------------------------------------------------------------------------------------------------------------------------------
 111// STP is currently only tested to run pre-tonemap at that is what Unity is using.
 112// Run 0 := pre-tonemap, 1 := post-tonemap.
 113#ifndef STP_POSTMAP
 114    #define STP_POSTMAP 0
 115#endif
 116//------------------------------------------------------------------------------------------------------------------------------
 117// STP TAA quality level {0 to 1}
 118#ifndef STP_TAA_Q
 119    #define STP_TAA_Q 1
 120#endif
 121//==============================================================================================================================
 122// PLATFORM SPECIFIC BUG WORKAROUNDS
 123// =================================
 124// Define to {1} to disable usage of transendental approximations using float/int aliasing.
 125#ifndef STP_BUG_PRX
 126    #define STP_BUG_PRX 0
 127#endif
 128//------------------------------------------------------------------------------------------------------------------------------
 129// Define to {1} for workaround if platform cannot use saturate of +/- INF correctly.
 130#ifndef STP_BUG_SAT_INF
 131    #define STP_BUG_SAT_INF 0
 132#endif
 133//------------------------------------------------------------------------------------------------------------------------------
 134// Define to {1} for workaround for compilier incorrectly factoring out inner saturate in 16-bit code.
 135#ifndef STP_BUG_SAT
 136    #define STP_BUG_SAT 0
 137#endif
 138//------------------------------------------------------------------------------------------------------------------------------
 139// Define to {1} for workarounds for broken asuint16()/asfloat16().
 140#ifndef STP_BUG_ALIAS16
 141    #define STP_BUG_ALIAS16 0
 142    #undef STP_BUG_PRX
 143    #define STP_BUG_PRX 1
 144#endif
 145////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 146////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 147//_____________________________________________________________.._______________________________________________________________
 148//==============================================================================================================================
 149//                                                  C/C++/GLSL/HLSL PORTABILITY
 150//==============================================================================================================================
 151#if defined(STP_CPU)
 152    #ifndef STP_RESTRICT
 153        #define STP_RESTRICT __restrict
 154    #endif
 155//------------------------------------------------------------------------------------------------------------------------------
 156    #ifndef STP_STATIC
 157        #define STP_STATIC static
 158    #endif
 159//------------------------------------------------------------------------------------------------------------------------------
 160    typedef unsigned char StpB1;
 161    typedef unsigned short StpW1;
 162    typedef float StpF1;
 163    typedef uint32_t StpU1;
 164    #define StpF1_(a) ((StpF1)(a))
 165    #define StpU1_(a) ((StpU1)(a))
 166    STP_STATIC StpU1 StpU1_F1(StpF1 a) { union { StpF1 f; StpU1 u; } bits; bits.f = a; return bits.u; }
 167    #define StpOutF2 StpF1 *STP_RESTRICT
 168    #define StpExp2F1(x) exp2f(x)
 169    STP_STATIC StpF1 StpMaxF1(StpF1 a, StpF1 b) { return a > b ? a : b; }
 170//------------------------------------------------------------------------------------------------------------------------------
 171    // Convert float to half (in lower 16-bits of output).
 172    // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
 173    // Supports denormals.
 174    // Conversion rules are to make computations possibly "safer" on the GPU,
 175    //  -INF & -NaN -> -65504
 176    //  +INF & +NaN -> +65504
 177    STP_STATIC StpU1 StpU1_H1_F1(StpF1 f) {
 178        static StpW1 base[512] = {
 179            0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 180            0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 181            0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 182            0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 183            0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 184            0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
 185            0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
 186            0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
 187            0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
 188            0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 189            0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 190            0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 191            0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 192            0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 193            0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 194            0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
 195            0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 196            0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 197            0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 198            0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 199            0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 200            0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
 201            0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
 202            0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
 203            0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
 204            0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 205            0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 206            0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 207            0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 208            0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 209            0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
 210            0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff };
 211        static StpB1 shift[512] = {
 212            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 213            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 214            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 215            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 216            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 217            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 218            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
 219            0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
 220            0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
 221            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 222            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 223            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 224            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 225            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 226            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 227            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 228            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 229            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 230            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 231            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 232            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 233            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 234            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
 235            0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
 236            0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
 237            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 238            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 239            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 240            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 241            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 242            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
 243            0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18 };
 244        union { StpF1 f; StpU1 u; } bits;
 245        bits.f = f; StpU1 u = bits.u; StpU1 i = u >> 23;
 246        return (StpU1)(base[i]) + ((u & 0x7fffff) >> shift[i]); }
 247//------------------------------------------------------------------------------------------------------------------------------
 248    STP_STATIC StpU1 StpU1_H2_F2(StpInF2 a) { return StpU1_H1_F1(a[0]) + (StpU1_H1_F1(a[1]) << 16); }
 249#endif // defined(STP_CPU)
 250//==============================================================================================================================
 251#if defined(STP_GPU) && defined(STP_GLSL)
 252    #define StpP1 bool
 253    #define StpP2 bvec2
 254//------------------------------------------------------------------------------------------------------------------------------
 255    #define StpF1 float
 256    #define StpF2 vec2
 257    #define StpF3 vec3
 258    #define StpF4 vec4
 259//------------------------------------------------------------------------------------------------------------------------------
 260    #define StpI2 ivec2
 261//------------------------------------------------------------------------------------------------------------------------------
 262    #define StpU1 uint
 263    #define StpU2 uvec2
 264    #define StpU3 uvec3
 265    #define StpU4 uvec4
 266//------------------------------------------------------------------------------------------------------------------------------
 267    #define StpF1_U1(x) uintBitsToFloat(StpU1(x))
 268    #define StpF2_U2(x) uintBitsToFloat(StpU2(x))
 269    #define StpF3_U3(x) uintBitsToFloat(StpU3(x))
 270    #define StpF4_U4(x) uintBitsToFloat(StpU4(x))
 271    #define StpU1_F1(x) floatBitsToUint(StpF1(x))
 272    #define StpU2_F2(x) floatBitsToUint(StpF2(x))
 273    #define StpU3_F3(x) floatBitsToUint(StpF3(x))
 274    #define StpU4_F4(x) floatBitsToUint(StpF4(x))
 275//------------------------------------------------------------------------------------------------------------------------------
 276    #define StpU1_H2_F2 packHalf2x16
 277    #define StpF2_H2_U1 unpackHalf2x16
 278//------------------------------------------------------------------------------------------------------------------------------
 279    StpU1 StpBfeU1(StpU1 src, StpU1 off, StpU1 bits) { return bitfieldExtract(src, int(off), int(bits)); }
 280    // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
 281    StpU1 StpBfiMskU1(StpU1 src, StpU1 ins, StpU1 bits) { return bitfieldInsert(src, ins, 0, int(bits)); }
 282#endif // defined(STP_GPU) && defined(STP_GLSL)
 283//==============================================================================================================================
 284#if defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
 285    #define StpH1 float16_t
 286    #define StpH2 f16vec2
 287    #define StpH3 f16vec3
 288    #define StpH4 f16vec4
 289//------------------------------------------------------------------------------------------------------------------------------
 290    #define StpW1 uint16_t
 291    #define StpW2 u16vec2
 292    #define StpW3 u16vec3
 293    #define StpW4 u16vec4
 294//------------------------------------------------------------------------------------------------------------------------------
 295    #define StpW2_U1(x) unpackUint2x16(StpU1(x))
 296    #define StpH2_U1(x) unpackFloat2x16(StpU1(x))
 297//------------------------------------------------------------------------------------------------------------------------------
 298    #define StpW1_H1(x) halfBitsToUint16(StpH1(x))
 299    #define StpW2_H2(x) halfBitsToUint16(StpH2(x))
 300    #define StpW3_H3(x) halfBitsToUint16(StpH3(x))
 301    #define StpW4_H4(x) halfBitsToUint16(StpH4(x))
 302//------------------------------------------------------------------------------------------------------------------------------
 303    #define StpH1_W1(x) uint16BitsToHalf(StpW1(x))
 304    #define StpH2_W2(x) uint16BitsToHalf(StpW2(x))
 305    #define StpH3_W3(x) uint16BitsToHalf(StpW3(x))
 306    #define StpH4_W4(x) uint16BitsToHalf(StpW4(x))
 307//------------------------------------------------------------------------------------------------------------------------------
 308    #define StpU1_H2(x) packFloat2x16(StpH2(x))
 309#endif // defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
 310//==============================================================================================================================
 311#if defined(STP_GPU) && defined(STP_HLSL)
 312    #define StpP1 bool
 313    #define StpP2 bool2
 314//------------------------------------------------------------------------------------------------------------------------------
 315    #define StpF1 float
 316    #define StpF2 float2
 317    #define StpF3 float3
 318    #define StpF4 float4
 319//------------------------------------------------------------------------------------------------------------------------------
 320    #define StpI2 int2
 321//------------------------------------------------------------------------------------------------------------------------------
 322    #define StpU1 uint
 323    #define StpU2 uint2
 324    #define StpU3 uint3
 325    #define StpU4 uint4
 326//------------------------------------------------------------------------------------------------------------------------------
 327    #define StpF1_U1(x) asfloat(StpU1(x))
 328    #define StpF2_U2(x) asfloat(StpU2(x))
 329    #define StpF3_U3(x) asfloat(StpU3(x))
 330    #define StpF4_U4(x) asfloat(StpU4(x))
 331    #define StpU1_F1(x) asuint(StpF1(x))
 332    #define StpU2_F2(x) asuint(StpF2(x))
 333    #define StpU3_F3(x) asuint(StpF3(x))
 334    #define StpU4_F4(x) asuint(StpF4(x))
 335//------------------------------------------------------------------------------------------------------------------------------
 336    StpU1 StpU1_H2_F2_x(StpF2 a) { return f32tof16(a.x) | (f32tof16(a.y) << 16); }
 337    #define StpU1_H2_F2(a) StpU1_H2_F2_x(StpF2(a))
 338//------------------------------------------------------------------------------------------------------------------------------
 339    StpF2 StpF2_H2_U1_x(StpU1 x) { return StpF2(f16tof32(x & 0xFFFF), f16tof32(x >> 16)); }
 340    #define StpF2_H2_U1(x) StpF2_H2_U1_x(StpU1(x))
 341//------------------------------------------------------------------------------------------------------------------------------
 342    StpU1 StpBfeU1(StpU1 src, StpU1 off, StpU1 bits) { StpU1 msk = (1u << bits) - 1; return (src >> off) & msk; }
 343    StpU1 StpBfiMskU1(StpU1 src, StpU1 ins, StpU1 bits) { StpU1 msk = (1u << bits) - 1; return (ins & msk) | (src & (~msk)); }
 344#endif // defined(STP_GPU) && defined(STP_HLSL)
 345//==============================================================================================================================
 346#if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_MEDIUM)
 347    #define StpMU1 min16uint
 348    #define StpMU2 min16uint2
 349    #define StpMU3 min16uint3
 350    #define StpMU4 min16uint4
 351//------------------------------------------------------------------------------------------------------------------------------
 352    #define StpMF1 min16float
 353    #define StpMF2 min16float2
 354    #define StpMF3 min16float3
 355    #define StpMF4 min16float4
 356#endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_MEDIUM)
 357//==============================================================================================================================
 358#if defined(STP_GPU) && (!defined(STP_MEDIUM))
 359    #define StpMU1 StpU1
 360    #define StpMU2 StpU2
 361    #define StpMU3 StpU3
 362    #define StpMU4 StpU4
 363//------------------------------------------------------------------------------------------------------------------------------
 364    #define StpMF1 StpF1
 365    #define StpMF2 StpF2
 366    #define StpMF3 StpF3
 367    #define StpMF4 StpF4
 368#endif // defined(STP_GPU) && (!defined(STP_MEDIUM))
 369//==============================================================================================================================
 370#if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
 371    #define StpH1 float16_t
 372    #define StpH2 float16_t2
 373    #define StpH3 float16_t3
 374    #define StpH4 float16_t4
 375//------------------------------------------------------------------------------------------------------------------------------
 376    #define StpW1 uint16_t
 377    #define StpW2 uint16_t2
 378    #define StpW3 uint16_t3
 379    #define StpW4 uint16_t4
 380//------------------------------------------------------------------------------------------------------------------------------
 381    StpW2 StpW2_U1_x(StpU1 x) { StpU2 t = StpU2(x & 0xFFFF, x >> 16); return StpW2(t); }
 382    #define StpW2_U1(x) StpW2_U1_x(StpU1(x))
 383    StpH2 StpH2_U1_x(StpU1 x) { return asfloat16(StpW2((StpW1)(x & 0xFFFF), (StpW1)(x >> 16))); }
 384    #define StpH2_U1(x) StpH2_U1_x(StpU1(x))
 385//------------------------------------------------------------------------------------------------------------------------------
 386    #define StpW1_H1(x) asuint16(StpH1(x))
 387    #define StpW2_H2(x) asuint16(StpH2(x))
 388    #define StpW3_H3(x) asuint16(StpH3(x))
 389    #define StpW4_H4(x) asuint16(StpH4(x))
 390//------------------------------------------------------------------------------------------------------------------------------
 391    #define StpH1_W1(x) asfloat16(StpW1(x))
 392    #define StpH2_W2(x) asfloat16(StpW2(x))
 393    #define StpH3_W3(x) asfloat16(StpW3(x))
 394    #define StpH4_W4(x) asfloat16(StpW4(x))
 395//------------------------------------------------------------------------------------------------------------------------------
 396    StpU1 StpU1_H2_x(StpH2 x) { StpW2 t = asuint16(x); return (((StpU1)t.x) | (((StpU1)t.y) << 16)); }
 397    #define StpU1_H2(x) StpU1_H2_x(StpH2(x))
 398#endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
 399//==============================================================================================================================
 400#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
 401    StpF1 StpMaxF1(StpF1 a, StpF1 b) { return max(a, b); }
 402//------------------------------------------------------------------------------------------------------------------------------
 403    StpP2 StpP2_x(StpP1 x) { return StpP2(x, x); }
 404    #define StpP2_(x) StpP2_x(StpP1(x))
 405//------------------------------------------------------------------------------------------------------------------------------
 406    StpF1 StpF1_x(StpF1 x) { return StpF1(x); }
 407    StpF2 StpF2_x(StpF1 x) { return StpF2(x, x); }
 408    StpF3 StpF3_x(StpF1 x) { return StpF3(x, x, x); }
 409    StpF4 StpF4_x(StpF1 x) { return StpF4(x, x, x, x); }
 410    #define StpF1_(x) StpF1_x(StpF1(x))
 411    #define StpF2_(x) StpF2_x(StpF1(x))
 412    #define StpF3_(x) StpF3_x(StpF1(x))
 413    #define StpF4_(x) StpF4_x(StpF1(x))
 414//------------------------------------------------------------------------------------------------------------------------------
 415    StpMF1 StpMF1_x(StpMF1 x) { return StpMF1(x); }
 416    StpMF2 StpMF2_x(StpMF1 x) { return StpMF2(x, x); }
 417    StpMF3 StpMF3_x(StpMF1 x) { return StpMF3(x, x, x); }
 418    StpMF4 StpMF4_x(StpMF1 x) { return StpMF4(x, x, x, x); }
 419    #define StpMF1_(x) StpMF1_x(StpMF1(x))
 420    #define StpMF2_(x) StpMF2_x(StpMF1(x))
 421    #define StpMF3_(x) StpMF3_x(StpMF1(x))
 422    #define StpMF4_(x) StpMF4_x(StpMF1(x))
 423//------------------------------------------------------------------------------------------------------------------------------
 424    StpMU1 StpMU1_x(StpMU1 x) { return StpMU1(x); }
 425    StpMU2 StpMU2_x(StpMU1 x) { return StpMU2(x, x); }
 426    StpMU3 StpMU3_x(StpMU1 x) { return StpMU3(x, x, x); }
 427    StpMU4 StpMU4_x(StpMU1 x) { return StpMU4(x, x, x, x); }
 428    #define StpMU1_(x) StpMU1_x(StpMU1(x))
 429    #define StpMU2_(x) StpMU2_x(StpMU1(x))
 430    #define StpMU3_(x) StpMU3_x(StpMU1(x))
 431    #define StpMU4_(x) StpMU4_x(StpMU1(x))
 432//------------------------------------------------------------------------------------------------------------------------------
 433    StpU1 StpU1_x(StpU1 x) { return StpU1(x); }
 434    StpU2 StpU2_x(StpU1 x) { return StpU2(x, x); }
 435    StpU3 StpU3_x(StpU1 x) { return StpU3(x, x, x); }
 436    StpU4 StpU4_x(StpU1 x) { return StpU4(x, x, x, x); }
 437    #define StpU1_(x) StpU1_x(StpU1(x))
 438    #define StpU2_(x) StpU2_x(StpU1(x))
 439    #define StpU3_(x) StpU3_x(StpU1(x))
 440    #define StpU4_(x) StpU4_x(StpU1(x))
 441//------------------------------------------------------------------------------------------------------------------------------
 442    #if 0
 443        // Slow implementation (if not pattern matched by a compiler).
 444        StpF1 StpCpySgnF1(StpF1 d, StpF1 s) { return StpF1_U1(StpU1_F1(d) | (StpU1_F1(s) & StpU1_(0x80000000u))); }
 445        StpF2 StpCpySgnF2(StpF2 d, StpF2 s) { return StpF2_U2(StpU2_F2(d) | (StpU2_F2(s) & StpU2_(0x80000000u))); }
 446        StpF3 StpCpySgnF3(StpF3 d, StpF3 s) { return StpF3_U3(StpU3_F3(d) | (StpU3_F3(s) & StpU3_(0x80000000u))); }
 447        StpF4 StpCpySgnF4(StpF4 d, StpF4 s) { return StpF4_U4(StpU4_F4(d) | (StpU4_F4(s) & StpU4_(0x80000000u))); }
 448    #else
 449        // Faster implementation (one portable BFI).
 450        StpF1 StpCpySgnF1(StpF1 d, StpF1 s) { return StpF1_U1(StpBfiMskU1(StpU1_F1(s), StpU1_F1(d), StpU1_(31))); }
 451        StpF2 StpCpySgnF2(StpF2 d, StpF2 s) { return StpF2(StpCpySgnF1(d.x, s.x), StpCpySgnF1(d.y, s.y)); }
 452        StpF3 StpCpySgnF3(StpF3 d, StpF3 s) {
 453            return StpF3(StpCpySgnF1(d.x, s.x), StpCpySgnF1(d.y, s.y), StpCpySgnF1(d.z, s.z)); }
 454        StpF4 StpCpySgnF4(StpF4 d, StpF4 s) {
 455            return StpF4(StpCpySgnF1(d.x, s.x), StpCpySgnF1(d.y, s.y), StpCpySgnF1(d.z, s.z), StpCpySgnF1(d.w, s.w)); }
 456    #endif
 457    StpF1 StpMax3F1(StpF1 x, StpF1 y, StpF1 z) { return max(x, max(y, z)); }
 458    StpF2 StpMax3F2(StpF2 x, StpF2 y, StpF2 z) { return max(x, max(y, z)); }
 459    StpF3 StpMax3F3(StpF3 x, StpF3 y, StpF3 z) { return max(x, max(y, z)); }
 460    StpF4 StpMax3F4(StpF4 x, StpF4 y, StpF4 z) { return max(x, max(y, z)); }
 461    StpF1 StpMin3F1(StpF1 x, StpF1 y, StpF1 z) { return min(x, min(y, z)); }
 462    StpF2 StpMin3F2(StpF2 x, StpF2 y, StpF2 z) { return min(x, min(y, z)); }
 463    StpF3 StpMin3F3(StpF3 x, StpF3 y, StpF3 z) { return min(x, min(y, z)); }
 464    StpF4 StpMin3F4(StpF4 x, StpF4 y, StpF4 z) { return min(x, min(y, z)); }
 465    StpU1 StpMax3U1(StpU1 x, StpU1 y, StpU1 z) { return max(x, max(y, z)); }
 466    StpU1 StpMin3U1(StpU1 x, StpU1 y, StpU1 z) { return min(x, min(y, z)); }
 467    StpU4 StpMin3U4(StpU4 x, StpU4 y, StpU4 z) { return min(x, min(y, z)); }
 468//------------------------------------------------------------------------------------------------------------------------------
 469    StpMF1 StpMax3MF1(StpMF1 x, StpMF1 y, StpMF1 z) { return max(x, max(y, z)); }
 470    StpMF2 StpMax3MF2(StpMF2 x, StpMF2 y, StpMF2 z) { return max(x, max(y, z)); }
 471    StpMF3 StpMax3MF3(StpMF3 x, StpMF3 y, StpMF3 z) { return max(x, max(y, z)); }
 472    StpMF4 StpMax3MF4(StpMF4 x, StpMF4 y, StpMF4 z) { return max(x, max(y, z)); }
 473    StpMF1 StpMin3MF1(StpMF1 x, StpMF1 y, StpMF1 z) { return min(x, min(y, z)); }
 474    StpMF2 StpMin3MF2(StpMF2 x, StpMF2 y, StpMF2 z) { return min(x, min(y, z)); }
 475    StpMF3 StpMin3MF3(StpMF3 x, StpMF3 y, StpMF3 z) { return min(x, min(y, z)); }
 476    StpMF4 StpMin3MF4(StpMF4 x, StpMF4 y, StpMF4 z) { return min(x, min(y, z)); }
 477//------------------------------------------------------------------------------------------------------------------------------
 478    // Make {<+0 := -1.0, >=+0 := 1.0}.
 479    StpF1 StpSgnOneF1(StpF1 x) { return StpF1_U1(StpBfiMskU1(StpU1_F1(x), StpU1_(0x3f800000), StpU1_(31))); }
 480#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
 481//==============================================================================================================================
 482#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
 483    StpH1 StpH1_x(StpH1 x) { return StpH1(x); }
 484    StpH2 StpH2_x(StpH1 x) { return StpH2(x, x); }
 485    StpH3 StpH3_x(StpH1 x) { return StpH3(x, x, x); }
 486    StpH4 StpH4_x(StpH1 x) { return StpH4(x, x, x, x); }
 487    #define StpH1_(x) StpH1_x(StpH1(x))
 488    #define StpH2_(x) StpH2_x(StpH1(x))
 489    #define StpH3_(x) StpH3_x(StpH1(x))
 490    #define StpH4_(x) StpH4_x(StpH1(x))
 491//------------------------------------------------------------------------------------------------------------------------------
 492    StpW1 StpW1_x(StpW1 x) { return StpW1(x); }
 493    StpW2 StpW2_x(StpW1 x) { return StpW2(x, x); }
 494    StpW3 StpW3_x(StpW1 x) { return StpW3(x, x, x); }
 495    StpW4 StpW4_x(StpW1 x) { return StpW4(x, x, x, x); }
 496    #define StpW1_(x) StpW1_x(StpW1(x))
 497    #define StpW2_(x) StpW2_x(StpW1(x))
 498    #define StpW3_(x) StpW3_x(StpW1(x))
 499    #define StpW4_(x) StpW4_x(StpW1(x))
 500//------------------------------------------------------------------------------------------------------------------------------
 501    StpH1 StpMax3H1(StpH1 x, StpH1 y, StpH1 z) { return max(x, max(y, z)); }
 502    StpH2 StpMax3H2(StpH2 x, StpH2 y, StpH2 z) { return max(x, max(y, z)); }
 503    StpH3 StpMax3H3(StpH3 x, StpH3 y, StpH3 z) { return max(x, max(y, z)); }
 504    StpH4 StpMax3H4(StpH4 x, StpH4 y, StpH4 z) { return max(x, max(y, z)); }
 505    StpH1 StpMin3H1(StpH1 x, StpH1 y, StpH1 z) { return min(x, min(y, z)); }
 506    StpH2 StpMin3H2(StpH2 x, StpH2 y, StpH2 z) { return min(x, min(y, z)); }
 507    StpH3 StpMin3H3(StpH3 x, StpH3 y, StpH3 z) { return min(x, min(y, z)); }
 508    StpH4 StpMin3H4(StpH4 x, StpH4 y, StpH4 z) { return min(x, min(y, z)); }
 509    StpW1 StpMax3W1(StpW1 x, StpW1 y, StpW1 z) { return max(x, max(y, z)); }
 510    StpW1 StpMin3W1(StpW1 x, StpW1 y, StpW1 z) { return min(x, min(y, z)); }
 511#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
 512//==============================================================================================================================
 513#if defined(STP_GPU) && defined(STP_GLSL)
 514    StpF1 StpFractF1(StpF1 x) { return fract(x); }
 515    StpF2 StpFractF2(StpF2 x) { return fract(x); }
 516    StpF3 StpFractF3(StpF3 x) { return fract(x); }
 517    StpF4 StpFractF4(StpF4 x) { return fract(x); }
 518    StpF1 StpLerpF1(StpF1 x, StpF1 y, StpF1 z) { return mix(x, y, z); }
 519    StpF2 StpLerpF2(StpF2 x, StpF2 y, StpF2 z) { return mix(x, y, z); }
 520    StpF3 StpLerpF3(StpF3 x, StpF3 y, StpF3 z) { return mix(x, y, z); }
 521    StpF4 StpLerpF4(StpF4 x, StpF4 y, StpF4 z) { return mix(x, y, z); }
 522    StpF1 StpRcpF1(StpF1 x) { return StpF1_(1.0) / x; }
 523    StpF2 StpRcpF2(StpF2 x) { return StpF2_(1.0) / x; }
 524    StpF3 StpRcpF3(StpF3 x) { return StpF3_(1.0) / x; }
 525    StpF4 StpRcpF4(StpF4 x) { return StpF4_(1.0) / x; }
 526    StpF1 StpRsqF1(StpF1 x) { return inversesqrt(x); }
 527    StpF2 StpRsqF2(StpF2 x) { return inversesqrt(x); }
 528    StpF3 StpRsqF3(StpF3 x) { return inversesqrt(x); }
 529    StpF4 StpRsqF4(StpF4 x) { return inversesqrt(x); }
 530    StpF1 StpSatF1(StpF1 x) { return clamp(x, StpF1_(0.0), StpF1_(1.0)); }
 531    StpF2 StpSatF2(StpF2 x) { return clamp(x, StpF2_(0.0), StpF2_(1.0)); }
 532    StpF3 StpSatF3(StpF3 x) { return clamp(x, StpF3_(0.0), StpF3_(1.0)); }
 533    StpF4 StpSatF4(StpF4 x) { return clamp(x, StpF4_(0.0), StpF4_(1.0)); }
 534//------------------------------------------------------------------------------------------------------------------------------
 535    StpMF1 StpFractMF1(StpMF1 x) { return fract(x); }
 536    StpMF2 StpFractMF2(StpMF2 x) { return fract(x); }
 537    StpMF3 StpFractMF3(StpMF3 x) { return fract(x); }
 538    StpMF4 StpFractMF4(StpMF4 x) { return fract(x); }
 539    StpMF1 StpLerpMF1(StpMF1 x, StpMF1 y, StpMF1 z) { return mix(x, y, z); }
 540    StpMF2 StpLerpMF2(StpMF2 x, StpMF2 y, StpMF2 z) { return mix(x, y, z); }
 541    StpMF3 StpLerpMF3(StpMF3 x, StpMF3 y, StpMF3 z) { return mix(x, y, z); }
 542    StpMF4 StpLerpMF4(StpMF4 x, StpMF4 y, StpMF4 z) { return mix(x, y, z); }
 543    StpMF1 StpRcpMF1(StpMF1 x) { return StpMF1_(1.0) / x; }
 544    StpMF2 StpRcpMF2(StpMF2 x) { return StpMF2_(1.0) / x; }
 545    StpMF3 StpRcpMF3(StpMF3 x) { return StpMF3_(1.0) / x; }
 546    StpMF4 StpRcpMF4(StpMF4 x) { return StpMF4_(1.0) / x; }
 547    StpMF1 StpRsqMF1(StpMF1 x) { return inversesqrt(x); }
 548    StpMF2 StpRsqMF2(StpMF2 x) { return inversesqrt(x); }
 549    StpMF3 StpRsqMF3(StpMF3 x) { return inversesqrt(x); }
 550    StpMF4 StpRsqMF4(StpMF4 x) { return inversesqrt(x); }
 551    StpMF1 StpSatMF1(StpMF1 x) { return clamp(x, StpMF1_(0.0), StpMF1_(1.0)); }
 552    StpMF2 StpSatMF2(StpMF2 x) { return clamp(x, StpMF2_(0.0), StpMF2_(1.0)); }
 553    StpMF3 StpSatMF3(StpMF3 x) { return clamp(x, StpMF3_(0.0), StpMF3_(1.0)); }
 554    StpMF4 StpSatMF4(StpMF4 x) { return clamp(x, StpMF4_(0.0), StpMF4_(1.0)); }
 555#endif // defined(STP_GPU) && defined(STP_GLSL)
 556//==============================================================================================================================
 557#if defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
 558    StpH1 StpFractH1(StpH1 x) { return fract(x); }
 559    StpH2 StpFractH2(StpH2 x) { return fract(x); }
 560    StpH3 StpFractH3(StpH3 x) { return fract(x); }
 561    StpH4 StpFractH4(StpH4 x) { return fract(x); }
 562    StpH1 StpLerpH1(StpH1 x, StpH1 y, StpH1 z) { return mix(x, y, z); }
 563    StpH2 StpLerpH2(StpH2 x, StpH2 y, StpH2 z) { return mix(x, y, z); }
 564    StpH3 StpLerpH3(StpH3 x, StpH3 y, StpH3 z) { return mix(x, y, z); }
 565    StpH4 StpLerpH4(StpH4 x, StpH4 y, StpH4 z) { return mix(x, y, z); }
 566    StpH1 StpRcpH1(StpH1 x) { return StpH1_(1.0) / x; }
 567    StpH2 StpRcpH2(StpH2 x) { return StpH2_(1.0) / x; }
 568    StpH3 StpRcpH3(StpH3 x) { return StpH3_(1.0) / x; }
 569    StpH4 StpRcpH4(StpH4 x) { return StpH4_(1.0) / x; }
 570    StpH1 StpRsqH1(StpH1 x) { return inversesqrt(x); }
 571    StpH2 StpRsqH2(StpH2 x) { return inversesqrt(x); }
 572    StpH3 StpRsqH3(StpH3 x) { return inversesqrt(x); }
 573    StpH4 StpRsqH4(StpH4 x) { return inversesqrt(x); }
 574    StpH1 StpSatH1(StpH1 x) { return clamp(x, StpH1_(0.0), StpH1_(1.0)); }
 575    StpH2 StpSatH2(StpH2 x) { return clamp(x, StpH2_(0.0), StpH2_(1.0)); }
 576    StpH3 StpSatH3(StpH3 x) { return clamp(x, StpH3_(0.0), StpH3_(1.0)); }
 577    StpH4 StpSatH4(StpH4 x) { return clamp(x, StpH4_(0.0), StpH4_(1.0)); }
 578#endif // defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
 579//==============================================================================================================================
 580#if defined(STP_GPU) && defined(STP_HLSL)
 581    StpF1 StpFractF1(StpF1 x) { return x - floor(x); }
 582    StpF2 StpFractF2(StpF2 x) { return x - floor(x); }
 583    StpF3 StpFractF3(StpF3 x) { return x - floor(x); }
 584    StpF4 StpFractF4(StpF4 x) { return x - floor(x); }
 585    StpF1 StpLerpF1(StpF1 x, StpF1 y, StpF1 z) { return lerp(x, y, z); }
 586    StpF2 StpLerpF2(StpF2 x, StpF2 y, StpF2 z) { return lerp(x, y, z); }
 587    StpF3 StpLerpF3(StpF3 x, StpF3 y, StpF3 z) { return lerp(x, y, z); }
 588    StpF4 StpLerpF4(StpF4 x, StpF4 y, StpF4 z) { return lerp(x, y, z); }
 589    StpF1 StpRcpF1(StpF1 x) { return rcp(x); }
 590    StpF2 StpRcpF2(StpF2 x) { return rcp(x); }
 591    StpF3 StpRcpF3(StpF3 x) { return rcp(x); }
 592    StpF4 StpRcpF4(StpF4 x) { return rcp(x); }
 593    StpF1 StpRsqF1(StpF1 x) { return rsqrt(x); }
 594    StpF2 StpRsqF2(StpF2 x) { return rsqrt(x); }
 595    StpF3 StpRsqF3(StpF3 x) { return rsqrt(x); }
 596    StpF4 StpRsqF4(StpF4 x) { return rsqrt(x); }
 597    StpF1 StpSatF1(StpF1 x) { return saturate(x); }
 598    StpF2 StpSatF2(StpF2 x) { return saturate(x); }
 599    StpF3 StpSatF3(StpF3 x) { return saturate(x); }
 600    StpF4 StpSatF4(StpF4 x) { return saturate(x); }
 601//------------------------------------------------------------------------------------------------------------------------------
 602    StpMF1 StpFractMF1(StpMF1 x) { return x - floor(x); }
 603    StpMF2 StpFractMF2(StpMF2 x) { return x - floor(x); }
 604    StpMF3 StpFractMF3(StpMF3 x) { return x - floor(x); }
 605    StpMF4 StpFractMF4(StpMF4 x) { return x - floor(x); }
 606    StpMF1 StpLerpMF1(StpMF1 x, StpMF1 y, StpMF1 z) { return lerp(x, y, z); }
 607    StpMF2 StpLerpMF2(StpMF2 x, StpMF2 y, StpMF2 z) { return lerp(x, y, z); }
 608    StpMF3 StpLerpMF3(StpMF3 x, StpMF3 y, StpMF3 z) { return lerp(x, y, z); }
 609    StpMF4 StpLerpMF4(StpMF4 x, StpMF4 y, StpMF4 z) { return lerp(x, y, z); }
 610    StpMF1 StpRcpMF1(StpMF1 x) { return rcp(x); }
 611    StpMF2 StpRcpMF2(StpMF2 x) { return rcp(x); }
 612    StpMF3 StpRcpMF3(StpMF3 x) { return rcp(x); }
 613    StpMF4 StpRcpMF4(StpMF4 x) { return rcp(x); }
 614    StpMF1 StpRsqMF1(StpMF1 x) { return rsqrt(x); }
 615    StpMF2 StpRsqMF2(StpMF2 x) { return rsqrt(x); }
 616    StpMF3 StpRsqMF3(StpMF3 x) { return rsqrt(x); }
 617    StpMF4 StpRsqMF4(StpMF4 x) { return rsqrt(x); }
 618    StpMF1 StpSatMF1(StpMF1 x) { return saturate(x); }
 619    StpMF2 StpSatMF2(StpMF2 x) { return saturate(x); }
 620    StpMF3 StpSatMF3(StpMF3 x) { return saturate(x); }
 621    StpMF4 StpSatMF4(StpMF4 x) { return saturate(x); }
 622#endif // defined(STP_GPU) && defined(STP_HLSL)
 623//==============================================================================================================================
 624#if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
 625    StpH1 StpFractH1(StpH1 x) { return x - floor(x); }
 626    StpH2 StpFractH2(StpH2 x) { return x - floor(x); }
 627    StpH3 StpFractH3(StpH3 x) { return x - floor(x); }
 628    StpH4 StpFractH4(StpH4 x) { return x - floor(x); }
 629    StpH1 StpLerpH1(StpH1 x, StpH1 y, StpH1 z) { return lerp(x, y, z); }
 630    StpH2 StpLerpH2(StpH2 x, StpH2 y, StpH2 z) { return lerp(x, y, z); }
 631    StpH3 StpLerpH3(StpH3 x, StpH3 y, StpH3 z) { return lerp(x, y, z); }
 632    StpH4 StpLerpH4(StpH4 x, StpH4 y, StpH4 z) { return lerp(x, y, z); }
 633    StpH1 StpRcpH1(StpH1 x) { return rcp(x); }
 634    StpH2 StpRcpH2(StpH2 x) { return rcp(x); }
 635    StpH3 StpRcpH3(StpH3 x) { return rcp(x); }
 636    StpH4 StpRcpH4(StpH4 x) { return rcp(x); }
 637    StpH1 StpRsqH1(StpH1 x) { return rsqrt(x); }
 638    StpH2 StpRsqH2(StpH2 x) { return rsqrt(x); }
 639    StpH3 StpRsqH3(StpH3 x) { return rsqrt(x); }
 640    StpH4 StpRsqH4(StpH4 x) { return rsqrt(x); }
 641    StpH1 StpSatH1(StpH1 x) { return saturate(x); }
 642    StpH2 StpSatH2(StpH2 x) { return saturate(x); }
 643    StpH3 StpSatH3(StpH3 x) { return saturate(x); }
 644    StpH4 StpSatH4(StpH4 x) { return saturate(x); }
 645#endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
 646//==============================================================================================================================
 647#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
 648    StpF1 StpExp2F1(StpF1 x) { return exp2(x); }
 649    StpF1 StpLog2F1(StpF1 x) { return log2(x); }
 650//------------------------------------------------------------------------------------------------------------------------------
 651    StpMF1 StpExp2MF1(StpMF1 x) { return exp2(x); }
 652    StpMF1 StpLog2MF1(StpMF1 x) { return log2(x); }
 653//------------------------------------------------------------------------------------------------------------------------------
 654    #define STP_INFN_F StpF1_U1(0xff800000u)
 655    #define STP_INFP_F StpF1_U1(0x7f800000u)
 656    #if STP_BUG_SAT_INF
 657        // Defined if unable to use the fast path because of problem related to saturating +/- INF.
 658        StpF1 StpGtZeroF1(StpF1 x) { return (x > StpF1_(0.0)) ? StpF1_(1.0) : StpF1_(0.0); }
 659        StpF3 StpGtZeroF3(StpF3 x) { return StpF3(StpGtZeroF1(x.r), StpGtZeroF1(x.g), StpGtZeroF1(x.b)); }
 660        StpF4 StpGtZeroF4(StpF4 x) { return StpF4(StpGtZeroF1(x.r), StpGtZeroF1(x.g),
 661            StpGtZeroF1(x.b), StpGtZeroF1(x.a)); }
 662        StpF1 StpSignedF1(StpF1 x) { return (x < StpF1_(0.0)) ? StpF1_(1.0) : StpF1_(0.0); }
 663        StpF2 StpSignedF2(StpF2 x) { return StpF2(StpSignedF1(x.r), StpSignedF1(x.g)); }
 664        StpF3 StpSignedF3(StpF3 x) { return StpF3(StpSignedF1(x.r), StpSignedF1(x.g), StpSignedF1(x.b)); }
 665        StpF4 StpSignedF4(StpF4 x) { return StpF4(StpSignedF1(x.r), StpSignedF1(x.g),
 666            StpSignedF1(x.b), StpSignedF1(x.a)); }
 667    #else
 668        StpF1 StpGtZeroF1(StpF1 x) { return StpSatF1(x * StpF1_(STP_INFP_F)); }
 669        StpF3 StpGtZeroF3(StpF3 x) { return StpSatF3(x * StpF3_(STP_INFP_F)); }
 670        StpF4 StpGtZeroF4(StpF4 x) { return StpSatF4(x * StpF4_(STP_INFP_F)); }
 671        StpF1 StpSignedF1(StpF1 x) { return StpSatF1(x * StpF1_(STP_INFN_F)); }
 672        StpF2 StpSignedF2(StpF2 x) { return StpSatF2(x * StpF2_(STP_INFN_F)); }
 673        StpF3 StpSignedF3(StpF3 x) { return StpSatF3(x * StpF3_(STP_INFN_F)); }
 674        StpF4 StpSignedF4(StpF4 x) { return StpSatF4(x * StpF4_(STP_INFN_F)); }
 675    #endif // STP_BUG_SAT_INF
 676//------------------------------------------------------------------------------------------------------------------------------
 677    #if STP_BUG_PRX
 678        StpF1 StpPrxLoSqrtF1(StpF1 a) { return sqrt(a); }
 679        StpF3 StpPrxLoSqrtF3(StpF3 a) { return sqrt(a); }
 680        StpF4 StpPrxLoSqrtF4(StpF4 a) { return sqrt(a); }
 681    #else
 682        StpF1 StpPrxLoSqrtF1(StpF1 a) { return StpF1_U1((StpU1_F1(a) >> StpU1_(1)) + StpU1_(0x1fbc4639)); }
 683        StpF3 StpPrxLoSqrtF3(StpF3 a) { return StpF3_U3((StpU3_F3(a) >> StpU3_(1)) + StpU3_(0x1fbc4639)); }
 684        StpF4 StpPrxLoSqrtF4(StpF4 a) { return StpF4_U4((StpU4_F4(a) >> StpU4_(1)) + StpU4_(0x1fbc4639)); }
 685    #endif // STP_BUG_PRX
 686//------------------------------------------------------------------------------------------------------------------------------
 687    #if STP_BUG_PRX
 688        StpF1 StpPrxLoRcpF1(StpF1 a) { return StpRcpF1(a); }
 689        StpF2 StpPrxLoRcpF2(StpF2 a) { return StpRcpF2(a); }
 690        StpF3 StpPrxLoRcpF3(StpF3 a) { return StpRcpF3(a); }
 691        StpF4 StpPrxLoRcpF4(StpF4 a) { return StpRcpF4(a); }
 692        StpF1 StpPrxMedRcpF1(StpF1 a) { return StpRcpF1(a); }
 693        StpF3 StpPrxMedRcpF3(StpF3 a) { return StpRcpF3(a); }
 694    #else
 695        StpF1 StpPrxLoRcpF1(StpF1 a) { return StpF1_U1(StpU1_(0x7ef07ebb) - StpU1_F1(a)); }
 696        StpF2 StpPrxLoRcpF2(StpF2 a) { return StpF2_U2(StpU2_(0x7ef07ebb) - StpU2_F2(a)); }
 697        StpF3 StpPrxLoRcpF3(StpF3 a) { return StpF3_U3(StpU3_(0x7ef07ebb) - StpU3_F3(a)); }
 698        StpF4 StpPrxLoRcpF4(StpF4 a) { return StpF4_U4(StpU4_(0x7ef07ebb) - StpU4_F4(a)); }
 699        StpF1 StpPrxMedRcpF1(StpF1 a) { StpF1 b = StpF1_U1(StpU1_(0x7ef19fff) - StpU1_F1(a));
 700            return b * (-b * a + StpF1_(2.0)); }
 701        StpF3 StpPrxMedRcpF3(StpF3 a) { StpF3 b = StpF3_U3(StpU3_(0x7ef19fff) - StpU3_F3(a));
 702            return b * (-b * a + StpF3_(2.0)); }
 703    #endif // STP_BUG_PRX
 704//------------------------------------------------------------------------------------------------------------------------------
 705    #define STP_STATIC /* */
 706    #define StpInF2 in StpF2
 707    #define StpInF4 in StpF4
 708    #define StpInOutU4 inout StpU4
 709    #define StpOutF2 out StpF2
 710    #define StpVarF2 StpF2
 711#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
 712//==============================================================================================================================
 713#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_MEDIUM)
 714    #if STP_BUG_SAT_INF
 715        // Defined if unable to use the fast path because of problem related to saturating +/- INF.
 716        StpMF1 StpGtZeroMF1(StpMF1 x) { return (x > StpMF1_(0.0)) ? StpMF1_(1.0) : StpMF1_(0.0); }
 717        StpMF3 StpGtZeroMF3(StpMF3 x) { return StpMF3(StpGtZeroMF1(x.r), StpGtZeroMF1(x.g), StpGtZeroMF1(x.b)); }
 718        StpMF4 StpGtZeroMF4(StpMF4 x) { return StpMF4(StpGtZeroMF1(x.r), StpGtZeroMF1(x.g),
 719            StpGtZeroMF1(x.b), StpGtZeroMF1(x.a)); }
 720        StpMF1 StpSignedMF1(StpMF1 x) { return (x < StpMF1_(0.0)) ? StpMF1_(1.0) : StpMF1_(0.0); }
 721        StpMF2 StpSignedMF2(StpMF2 x) { return StpMF2(StpSignedMF1(x.r), StpSignedMF1(x.g)); }
 722        StpMF3 StpSignedMF3(StpMF3 x) { return StpMF3(StpSignedMF1(x.r), StpSignedMF1(x.g), StpSignedMF1(x.b)); }
 723        StpMF4 StpSignedMF4(StpMF4 x) { return StpMF4(StpSignedMF1(x.r), StpSignedMF1(x.g),
 724            StpSignedMF1(x.b), StpSignedMF1(x.a)); }
 725    #elif STP_BUG_SAT
 726        // Defined if compiler factors out saturation incorrectly.
 727        #define STP_INFN_MF StpMF1(StpF1_U1(0xff800000u))
 728        #define STP_INFP_MF StpMF1(StpF1_U1(0x7f800000u))
 729        StpMF1 StpGtZeroMF1(StpMF1 x) { return max(min(x * StpMF1_(STP_INFP_MF), StpMF1_(1.0)), StpMF1_(0.0)); }
 730        StpMF3 StpGtZeroMF3(StpMF3 x) { return max(min(x * StpMF3_(STP_INFP_MF), StpMF3_(1.0)), StpMF3_(0.0)); }
 731        StpMF4 StpGtZeroMF4(StpMF4 x) { return max(min(x * StpMF4_(STP_INFP_MF), StpMF4_(1.0)), StpMF4_(0.0)); }
 732        StpMF1 StpSignedMF1(StpMF1 x) { return max(min(x * StpMF1_(STP_INFN_MF), StpMF1_(1.0)), StpMF1_(0.0)); }
 733        StpMF2 StpSignedMF2(StpMF2 x) { return max(min(x * StpMF2_(STP_INFN_MF), StpMF2_(1.0)), StpMF2_(0.0)); }
 734        StpMF3 StpSignedMF3(StpMF3 x) { return max(min(x * StpMF3_(STP_INFN_MF), StpMF3_(1.0)), StpMF3_(0.0)); }
 735        StpMF4 StpSignedMF4(StpMF4 x) { return max(min(x * StpMF4_(STP_INFN_MF), StpMF4_(1.0)), StpMF4_(0.0)); }
 736    #else
 737        // Using +/- INF typecast down to medium precision.
 738        #define STP_INFN_MF StpMF1(StpF1_U1(0xff800000u))
 739        #define STP_INFP_MF StpMF1(StpF1_U1(0x7f800000u))
 740        StpMF1 StpGtZeroMF1(StpMF1 x) { return StpSatMF1(x * StpMF1_(STP_INFP_MF)); }
 741        StpMF3 StpGtZeroMF3(StpMF3 x) { return StpSatMF3(x * StpMF3_(STP_INFP_MF)); }
 742        StpMF4 StpGtZeroMF4(StpMF4 x) { return StpSatMF4(x * StpMF4_(STP_INFP_MF)); }
 743        StpMF1 StpSignedMF1(StpMF1 x) { return StpSatMF1(x * StpMF1_(STP_INFN_MF)); }
 744        StpMF2 StpSignedMF2(StpMF2 x) { return StpSatMF2(x * StpMF2_(STP_INFN_MF)); }
 745        StpMF3 StpSignedMF3(StpMF3 x) { return StpSatMF3(x * StpMF3_(STP_INFN_MF)); }
 746        StpMF4 StpSignedMF4(StpMF4 x) { return StpSatMF4(x * StpMF4_(STP_INFN_MF)); }
 747    #endif // STP_BUG_SAT_INF
 748//------------------------------------------------------------------------------------------------------------------------------
 749    // Unable to use the approximations due to not knowing what the type actually is.
 750    StpMF1 StpPrxLoSqrtMF1(StpMF1 a) { return sqrt(a); }
 751    StpMF3 StpPrxLoSqrtMF3(StpMF3 a) { return sqrt(a); }
 752    StpMF4 StpPrxLoSqrtMF4(StpMF4 a) { return sqrt(a); }
 753//------------------------------------------------------------------------------------------------------------------------------
 754    StpMF1 StpPrxLoRcpMF1(StpMF1 a) { return StpRcpMF1(a); }
 755    StpMF2 StpPrxLoRcpMF2(StpMF2 a) { return StpRcpMF2(a); }
 756    StpMF3 StpPrxLoRcpMF3(StpMF3 a) { return StpRcpMF3(a); }
 757    StpMF4 StpPrxLoRcpMF4(StpMF4 a) { return StpRcpMF4(a); }
 758    StpMF1 StpPrxMedRcpMF1(StpMF1 a) { return StpRcpMF1(a); }
 759    StpMF3 StpPrxMedRcpMF3(StpMF3 a) { return StpRcpMF3(a); }
 760#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_MEDIUM)
 761//==============================================================================================================================
 762#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && (!defined(STP_MEDIUM))
 763    // Same types so just use the full precision version.
 764    #define StpGtZeroMF1(a) StpGtZeroF1(a)
 765    #define StpGtZeroMF2(a) StpGtZeroF2(a)
 766    #define StpGtZeroMF3(a) StpGtZeroF3(a)
 767    #define StpGtZeroMF4(a) StpGtZeroF4(a)
 768    #define StpSignedMF1(a) StpSignedF1(a)
 769    #define StpSignedMF2(a) StpSignedF2(a)
 770    #define StpSignedMF3(a) StpSignedF3(a)
 771    #define StpSignedMF4(a) StpSignedF4(a)
 772//------------------------------------------------------------------------------------------------------------------------------
 773    // The medium precision types are the same as the full precision so use the full precision approximations.
 774    #define StpPrxLoSqrtMF1(a) StpPrxLoSqrtF1(a)
 775    #define StpPrxLoSqrtMF3(a) StpPrxLoSqrtF3(a)
 776    #define StpPrxLoSqrtMF4(a) StpPrxLoSqrtF4(a)
 777//------------------------------------------------------------------------------------------------------------------------------
 778    #define StpPrxLoRcpMF1(a) StpPrxLoRcpF1(a)
 779    #define StpPrxLoRcpMF2(a) StpPrxLoRcpF2(a)
 780    #define StpPrxLoRcpMF3(a) StpPrxLoRcpF3(a)
 781    #define StpPrxLoRcpMF4(a) StpPrxLoRcpF4(a)
 782    #define StpPrxMedRcpMF1(a) StpPrxMedRcpF1(a)
 783    #define StpPrxMedRcpMF3(a) StpPrxMedRcpF3(a)
 784#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && (!defined(STP_MEDIUM))
 785//==============================================================================================================================
 786#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
 787    StpH1 StpExp2H1(StpH1 x) { return exp2(x); }
 788    StpH1 StpLog2H1(StpH1 x) { return log2(x); }
 789//------------------------------------------------------------------------------------------------------------------------------
 790    #if STP_BUG_ALIAS16
 791        // Use 32-bit aliasing to build the +/-INF, then typecast to 16-bit.
 792        #define STP_INFN_H StpH1(StpF1_U1(0xff800000u))
 793        #define STP_INFP_H StpH1(StpF1_U1(0x7f800000u))
 794    #else
 795        #define STP_INFN_H StpH1_W1(StpW1_(0xfc00))
 796        #define STP_INFP_H StpH1_W1(StpW1_(0x7c00))
 797    #endif // STP_BUG_ALIAS16
 798    #if STP_BUG_SAT_INF
 799        StpH1 StpGtZeroH1(StpH1 x) { return (x > StpH1_(0.0)) ? StpH1_(1.0) : StpH1_(0.0); }
 800        StpH2 StpGtZeroH2(StpH2 x) { return StpH2(StpGtZeroH1(x.r), StpGtZeroH1(x.g)); }
 801        StpH3 StpGtZeroH3(StpH3 x) { return StpH3(StpGtZeroH1(x.r), StpGtZeroH1(x.g), StpGtZeroH1(x.b)); }
 802        StpH4 StpGtZeroH4(StpH4 x) { return StpH4(StpGtZeroH1(x.r), StpGtZeroH1(x.g),
 803            StpGtZeroH1(x.b), StpGtZeroH1(x.a)); }
 804        StpH1 StpSignedH1(StpH1 x) { return (x < StpH1_(0.0)) ? StpH1_(1.0) : StpH1_(0.0); }
 805        StpH2 StpSignedH2(StpH2 x) { return StpH2(StpSignedH1(x.r), StpSignedH1(x.g)); }
 806        StpH3 StpSignedH3(StpH3 x) { return StpH3(StpSignedH1(x.r), StpSignedH1(x.g), StpSignedH1(x.b)); }
 807        StpH4 StpSignedH4(StpH4 x) { return StpH4(StpSignedH1(x.r), StpSignedH1(x.g),
 808            StpSignedH1(x.b), StpSignedH1(x.a)); }
 809    #elif STP_BUG_SAT
 810        StpH1 StpGtZeroH1(StpH1 x) { return max(min(x * StpH1_(STP_INFP_H), StpH1_(1.0)), StpH1_(0.0)); }
 811        StpH2 StpGtZeroH2(StpH2 x) { return max(min(x * StpH2_(STP_INFP_H), StpH2_(1.0)), StpH2_(0.0)); }
 812        StpH3 StpGtZeroH3(StpH3 x) { return max(min(x * StpH3_(STP_INFP_H), StpH3_(1.0)), StpH3_(0.0)); }
 813        StpH4 StpGtZeroH4(StpH4 x) { return max(min(x * StpH4_(STP_INFP_H), StpH4_(1.0)), StpH4_(0.0)); }
 814        StpH1 StpSignedH1(StpH1 x) { return max(min(x * StpH1_(STP_INFN_H), StpH1_(1.0)), StpH1_(0.0)); }
 815        StpH2 StpSignedH2(StpH2 x) { return max(min(x * StpH2_(STP_INFN_H), StpH2_(1.0)), StpH2_(0.0)); }
 816        StpH3 StpSignedH3(StpH3 x) { return max(min(x * StpH3_(STP_INFN_H), StpH3_(1.0)), StpH3_(0.0)); }
 817        StpH4 StpSignedH4(StpH4 x) { return max(min(x * StpH4_(STP_INFN_H), StpH4_(1.0)), StpH4_(0.0)); }
 818    #else
 819        StpH1 StpGtZeroH1(StpH1 x) { return StpSatH1(x * StpH1_(STP_INFP_H)); }
 820        StpH2 StpGtZeroH2(StpH2 x) { return StpSatH2(x * StpH2_(STP_INFP_H)); }
 821        StpH3 StpGtZeroH3(StpH3 x) { return StpSatH3(x * StpH3_(STP_INFP_H)); }
 822        StpH4 StpGtZeroH4(StpH4 x) { return StpSatH4(x * StpH4_(STP_INFP_H)); }
 823        StpH1 StpSignedH1(StpH1 x) { return StpSatH1(x * StpH1_(STP_INFN_H)); }
 824        StpH2 StpSignedH2(StpH2 x) { return StpSatH2(x * StpH2_(STP_INFN_H)); }
 825        StpH3 StpSignedH3(StpH3 x) { return StpSatH3(x * StpH3_(STP_INFN_H)); }
 826        StpH4 StpSignedH4(StpH4 x) { return StpSatH4(x * StpH4_(STP_INFN_H)); }
 827    #endif // STP_BUG_SAT_INF
 828//------------------------------------------------------------------------------------------------------------------------------
 829    #if STP_BUG_PRX
 830        StpH1 StpPrxLoSqrtH1(StpH1 a) { return sqrt(a); }
 831        StpH3 StpPrxLoSqrtH3(StpH3 a) { return sqrt(a); }
 832        StpH4 StpPrxLoSqrtH4(StpH4 a) { return sqrt(a); }
 833    #else
 834        StpH1 StpPrxLoSqrtH1(StpH1 a) { return StpH1_W1((StpW1_H1(a) >> StpW1_(1)) + StpW1_(0x1de2)); }
 835        StpH3 StpPrxLoSqrtH3(StpH3 a) { return StpH3_W3((StpW3_H3(a) >> StpW3_(1)) + StpW3_(0x1de2)); }
 836        StpH4 StpPrxLoSqrtH4(StpH4 a) { return StpH4_W4((StpW4_H4(a) >> StpW4_(1)) + StpW4_(0x1de2)); }
 837    #endif // STP_BUG_PRX
 838//------------------------------------------------------------------------------------------------------------------------------
 839    #if STP_BUG_PRX
            // Approximate reciprocal helpers for the 16-bit float types ("Lo" and "Med" precision variants).
            // With STP_BUG_PRX set, every variant simply forwards to StpRcpH*() (defined outside this section),
            // so none of the bit-trick approximations in the '#else' branch are compiled.
 840        StpH1 StpPrxLoRcpH1(StpH1 a) { return StpRcpH1(a); }
 841        StpH2 StpPrxLoRcpH2(StpH2 a) { return StpRcpH2(a); }
 842        StpH3 StpPrxLoRcpH3(StpH3 a) { return StpRcpH3(a); }
 843        StpH4 StpPrxLoRcpH4(StpH4 a) { return StpRcpH4(a); }
 844        StpH1 StpPrxMedRcpH1(StpH1 a) { return StpRcpH1(a); }
 845        StpH3 StpPrxMedRcpH3(StpH3 a) { return StpRcpH3(a); }
 846    #else
            // "Lo" variants: a single 16-bit integer subtract, '0x7784 - bits(a)', with the result used as a half again.
            // NOTE(review): StpW*_H*() / StpH*_W*() are defined outside this section; they appear to reinterpret
            //  between half and 16-bit integer bit patterns (consistent with the hex values in the table) - confirm.
 847        // Note this will create denormals.
 848        //  MAPPING
 849        //  -------
 850        //   +INF (7c00) -> -61568
 851        //  65504 (7bff) -> -61600
 852        //  30800 (7785) -> NaN
 853        //  30784 (7784) -> 0 ........ (any input larger than 30784 will break)
 854        //  1     (3c00) -> 0.9395 ... (so not energy preserving for 1.0)
 855        //  0     (0000) -> 30784
 856        StpH1 StpPrxLoRcpH1(StpH1 a) { return StpH1_W1(StpW1_(0x7784) - StpW1_H1(a)); }
 857        StpH2 StpPrxLoRcpH2(StpH2 a) { return StpH2_W2(StpW2_(0x7784) - StpW2_H2(a)); }
 858        StpH3 StpPrxLoRcpH3(StpH3 a) { return StpH3_W3(StpW3_(0x7784) - StpW3_H3(a)); }
 859        StpH4 StpPrxLoRcpH4(StpH4 a) { return StpH4_W4(StpW4_(0x7784) - StpW4_H4(a)); }
            // "Med" variants: the same subtract with constant 0x778d gives the first estimate 'b', which is then
            // refined by one Newton-Raphson step for the reciprocal, 'b * (2 - a * b)'.
            // Only the 1 and 3 component forms are provided.
 860        // Anything larger than 30928 will break in this function.
 861        StpH1 StpPrxMedRcpH1(StpH1 a) { StpH1 b = StpH1_W1(StpW1_(0x778d) - StpW1_H1(a));
 862            return b * (-b * a + StpH1_(2.0)); }
 863        StpH3 StpPrxMedRcpH3(StpH3 a) { StpH3 b = StpH3_W3(StpW3_(0x778d) - StpW3_H3(a));
 864            return b * (-b * a + StpH3_(2.0)); }
 865    #endif // STP_BUG_PRX
 866#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
 867////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 868////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 869//_____________________________________________________________.._______________________________________________________________
 870//==============================================================================================================================
 871//                                                        LANE REMAPPING
 872//==============================================================================================================================
 873#if defined(STP_GPU)
 874    // More complex remap which is safe for both portability (different wave sizes up to 128) and for 2D wave reductions.
 875    //  6543210
 876    //  =======
 877    //  ..xx..x
 878    //  yy..yy.
 879    // Details,
 880    //  LANE TO 8x16 MAPPING
 881    //  ====================
 882    //  00 01 08 09 10 11 18 19
 883    //  02 03 0a 0b 12 13 1a 1b
 884    //  04 05 0c 0d 14 15 1c 1d
 885    //  06 07 0e 0f 16 17 1e 1f
 886    //  20 21 28 29 30 31 38 39
 887    //  22 23 2a 2b 32 33 3a 3b
 888    //  24 25 2c 2d 34 35 3c 3d
 889    //  26 27 2e 2f 36 37 3e 3f
 890    //  .......................
 891    //  ... repeat the 8x8 ....
 892    //  .... pattern, but .....
 893    //  .... for 40 to 7f .....
 894    //  .......................
 895    StpU2 StpRmp8x16U2(StpU1 a) {
            // Remaps lane index 'a' to a 2D {x,y} coordinate, following the bit layout documented above:
            //  x is assembled from lane bits {4,3,0} and y from lane bits {6,5,2,1}.
            // NOTE(review): assumes StpBfeU1(v, off, bits) extracts 'bits' bits starting at bit 'off', and that
            //  StpBfiMskU1(src, ins, bits) replaces the low 'bits' bits of 'src' with those of 'ins'.
            //  Both helpers are defined outside this section - confirm.
 896        // Note the BFIs used for MSBs have "strange offsets" due to leaving space for the LSB bits replaced in the BFI.
 897        return StpU2(StpBfiMskU1(StpBfeU1(a, 2u, 3u), a, 1u),
 898            StpBfiMskU1(StpBfeU1(a, 3u, 4u), StpBfeU1(a, 1u, 2u), 2u)); }
 899#endif // defined(STP_GPU)
 900////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 901////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 902//_____________________________________________________________.._______________________________________________________________
 903//==============================================================================================================================
 904//                                                     PRESETS (DON'T CHANGE)
 905//==============================================================================================================================
 906// High-end mobile.
    // Preset compiled in when STP_TAA_Q == 0.
    // Relative to the STP_TAA_Q == 1 preset only STP_GEAA_P (1 vs 3), STP_TAA_PRX_LANCZOS (1 vs 2) and
    // STP_TAA_PRX_LANCZOS_DERING (0 vs 1) differ; the SUBPIX and PEN values are the same in both.
 907#if (STP_TAA_Q == 0)
 908    #define STP_GEAA_P 1
 909    #define STP_GEAA_SUBPIX (2.0 / 16.0)
 910    #define STP_TAA_PEN_F1 (1.0 / 4.0)
 911    #define STP_TAA_PEN_F0 (1.0 / 2.0)
 912    #define STP_TAA_PEN_W (1.0 / 2.0)
 913    #define STP_TAA_PRX_LANCZOS 1
 914    #define STP_TAA_PRX_LANCZOS_DERING 0
 915#endif // (STP_TAA_Q == 0)
 916//------------------------------------------------------------------------------------------------------------------------------
 917// Desktop.
    // Preset compiled in when STP_TAA_Q == 1.
    // NOTE(review): this section only provides presets for STP_TAA_Q values 0 and 1; for any other value these
    //  macros are not defined here - confirm STP_TAA_Q is constrained where it is configured.
 918#if (STP_TAA_Q == 1)
 919    #define STP_GEAA_P 3
 920    #define STP_GEAA_SUBPIX (2.0 / 16.0)
 921    #define STP_TAA_PEN_F1 (1.0 / 4.0)
 922    #define STP_TAA_PEN_F0 (1.0 / 2.0)
 923    #define STP_TAA_PEN_W (1.0 / 2.0)
 924    #define STP_TAA_PRX_LANCZOS 2
 925    #define STP_TAA_PRX_LANCZOS_DERING 1
 926#endif // (STP_TAA_Q == 1)
 927////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 928////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 929//_____________________________________________________________.._______________________________________________________________
 930//==============================================================================================================================
 931//                                               INTERNAL TUNING (DON'T CHANGE)
 932//==============================================================================================================================
 933// Limits on anti-flicker weighting, tuning for range and precision challenges of FP16.
 934#define STP_ANTI_MAX 8192.0
 935// Using '1/8192' causes known problems on some platforms that are 16-bit precision challenged.
 936#define STP_ANTI_MIN (1.0 / 4096.0)
 937//------------------------------------------------------------------------------------------------------------------------------
    // Dither switches (0 = off, 1 = on).
    //  STP_DITHER_DEPTH .... selects the dithered return path in StpZPack().
    //  STP_DITHER_MOTION ... selects the dithered sqrt() encode in StpMvPack().
 938#define STP_DITHER_DEPTH 1
 939#define STP_DITHER_MOTION 1
 940//------------------------------------------------------------------------------------------------------------------------------
 941// Ratios for luma in a gamma space, using BT.709 luma.
 942#define STP_LUMA_R 0.2126
 943#define STP_LUMA_G 0.7152
 944#define STP_LUMA_B 0.0722
 945#define STP_LUMA STP_LUMA_R, STP_LUMA_G, STP_LUMA_B
 946//------------------------------------------------------------------------------------------------------------------------------
 947// Maximum frames of feedback.
 948#define STP_FRAME_MAX 32.0
 949//------------------------------------------------------------------------------------------------------------------------------
 950// Control the min (motion match), and max (no motion match), in units of pixels.
 951// Settings of {max=1.0} won't work for 8x area scaling (trailing edge smears).
 952// Setting too tight won't have enough slop for motion matching (motion match easily fails, leading to loss of detail).
 953// If STP_PAT_MOT_MAX is too big, it will look like edges expand (or float) during change of motion.
 954#define STP_PAT_MOT_MIN (1.0 / 16.0)
 955#define STP_PAT_MOT_MAX (1.0 / 8.0)
 956// Computed constants.
    //  STP_PAT_MOT_ADD = min^2 .................. (1/256 with the values above)
    //  STP_PAT_MOT_AMP = 1 / (max^2 - min^2) .... (256/3 with the values above)
    // NOTE(review): presumably applied to a squared motion difference so that min^2 maps to 0 and max^2 maps to 1;
    //  the use site is outside this section - confirm.
 957#define STP_PAT_MOT_ADD (STP_PAT_MOT_MIN * STP_PAT_MOT_MIN)
 958#define STP_PAT_MOT_AMP (1.0 / (STP_PAT_MOT_MAX * STP_PAT_MOT_MAX - STP_PAT_MOT_ADD))
 959//------------------------------------------------------------------------------------------------------------------------------
 960// Larger numbers ghost more, smaller numbers flicker more.
 961#define STP_PAT_DEMOIRE 64.0
 962// Increase for less ghosting, decrease for more ghosting.
 963#define STP_PAT_SENSITIVITY (2.0 / 16.0)
 964// Amount to scale up sensitivity on responsive. Lower numbers ghost more, higher flicker more.
 965#define STP_PAT_RESPONSIVE 16.0
 966// Minimum neighborhood (defaults to 1/32 of maximum value of neighborhood to allow some noise).
 967#define STP_PAT_NE_MIN (1.0 / 32.0)
 968//------------------------------------------------------------------------------------------------------------------------------
 969// {0} = default lowest dilation (higher chance of slight trailing ghost, but less overall flicker)
 970// {1} = expand a little (higher cost)
 971// {2} = expand by too much (a lot more cost, more flicker, perhaps less trailing ghost)
 972// In practice it's dilation and motion match threshold (PAT_MOT) which results in the final {flicker, ghost} tradeoff.
    // NOTE(review): the list above describes {0} as the default, but the value selected below is 1 - confirm which is intended.
 973#define STP_SAFE_DILATE 1
 974//------------------------------------------------------------------------------------------------------------------------------
 975// Adjusts the point at which spatial-only weights blend up and anti-flicker fully takes over.
 976#define STP_TAA_SAA (1.0 / 2.0)
 977// De-weight pixel contribution for chopped corner.
 978#define STP_TAA_TRI_MASK_AVOID (1.0 / 8192.0)
 979////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 980////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 981//_____________________________________________________________.._______________________________________________________________
 982//==============================================================================================================================
 983//                                                      JITTER LOCATIONS
 984//------------------------------------------------------------------------------------------------------------------------------
 985// STP is now using Halton(2,3).
 986//==============================================================================================================================
 987// Generate jitter amount given frame index.
    //  p ....... output, receives the two jitter components
    //  frame ... frame index
    // NOTE: currently a stub - 'frame' is not read and both components of 'p' are written as zero, so no
    //  Halton(2,3) offsets are produced by this function.
 988STP_STATIC void StpJit(StpOutF2 p, StpU1 frame) {
 989    // TODO: This function isn't used inside Unity, if ever this is used the implementation should be added here.
 990    p[0] = StpF1_(0.0);
 991    p[1] = StpF1_(0.0); }
 992////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 993////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 994//_____________________________________________________________.._______________________________________________________________
 995//==============================================================================================================================
 996//                                                     PARABOLIC {SIN,COS}
 997//==============================================================================================================================
 998#if defined(STP_GPU)
 999    // Input is {-1 to 1} representing {0 to 2 pi}, output is {-1/4 to 1/4} representing {-1 to 1}.
        // Evaluates 'p * |p| - p' per component, in place: zero at p = {-1, 0, 1}, +1/4 at p = -1/2, -1/4 at p = +1/2.
1000    void StpPSinF2(inout StpF2 p) { p = p * abs(p) - p; }
1001    // This is used to dither position of gather4 fetch for nearest motion vector to remove nearest artifacts when scaling.
1002    // Input 'p.x' is {0 to 1} representing {0 to 2 pi}, output is {-1/4 to 1/4} representing {-1 to 1}.
        // The incoming 'p.y' is ignored: it is overwritten with 'p.x + 1/4' wrapped by StpFractF1() (a quarter period ahead).
        // Both components are then remapped from {0 to 1} to {-1 to 1} and passed through StpPSinF2(), so on return
        // 'p.x' holds the sine approximation and 'p.y' the cosine approximation.
        // NOTE(review): assumes StpFractF1() returns the fractional part; it is defined outside this section.
1003    void StpPSinCosF(inout StpF2 p) { p.y = StpFractF1(p.x + StpF1_(0.25)); p = p * StpF2_(2.0) - StpF2_(1.0); StpPSinF2(p); }
1004//------------------------------------------------------------------------------------------------------------------------------
        // Same two functions as above, written for the StpMF2 type.
1005    void StpPSinMF2(inout StpMF2 p) { p = p * abs(p) - p; }
1006    void StpPSinCosMF(inout StpMF2 p) {
1007        p.y = StpFractMF1(p.x + StpMF1_(0.25));
1008        p = p * StpMF2_(2.0) - StpMF2_(1.0); StpPSinMF2(p); }
1009#endif // defined(STP_GPU)
1010//==============================================================================================================================
1011#if defined(STP_GPU) && defined(STP_16BIT)
        // 16-bit (StpH2) forms of the parabolic {sin,cos}, only compiled when STP_16BIT is defined.
        // StpPSinH2: evaluates 'p * |p| - p' per component, in place.
1012    void StpPSinH2(inout StpH2 p) { p = p * abs(p) - p; }
        // StpPSinCosH: overwrites 'p.y' with 'p.x + 1/4' wrapped by StpFractH1(), remaps both components from
        // {0 to 1} to {-1 to 1}, then applies StpPSinH2() so 'p.x' is the sine and 'p.y' the cosine approximation.
1013    void StpPSinCosH(inout StpH2 p) { p.y = StpFractH1(p.x + StpH1_(0.25)); p = p * StpH2_(2.0) - StpH2_(1.0); StpPSinH2(p); }
1014#endif // defined(STP_GPU) && defined(STP_16BIT)
1015////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1016////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1017//_____________________________________________________________.._______________________________________________________________
1018//==============================================================================================================================
1019//                                                        DEPTH ENCODING
1020//------------------------------------------------------------------------------------------------------------------------------
1021// Using a log2() based encoding, takes {0 to inf} to {0 to 1}.
1022//  log2(k.x*z)*k.y
1023// Where
1024//  k.x = 1/near ............ (so that k.x*z is 1 when z=near)
1025//  k.y = 1/log2(k.x*far) ... (so that output is {0 to 1} ranged)
1026//------------------------------------------------------------------------------------------------------------------------------
1027// And the inverse
1028//  exp2(x*k.x)*k.y
1029// Where
1030//  k.x = log2(far/near)
1031//  k.y = near
1032//==============================================================================================================================
1033#if defined(STP_GPU)
1034    // Build the constants, based on near and far planes.
1035    // The 'far' is where anything more distant clamps to 1.0.
        // Returns the encode constants consumed by StpZPack():
        //  k.x = 1/near
        //  k.y = 1/log2(far/near)
        // Note 'far == near' makes the log2() term zero, so 'k.y' would be the reciprocal of zero.
1036    StpF2 StpZCon(StpF1 near, StpF1 far) {
1037        StpF2 k;
1038        k.x = StpRcpF1(near);
1039        k.y = StpRcpF1(log2(k.x * far));
1040        return k; }
1041//------------------------------------------------------------------------------------------------------------------------------
1042    // Where 'k' is generated by StpZCon().
        // Encodes depth 'z' as 'log2(k.x * z) * k.y', passed through StpSatF1().
        //  z ..... depth to encode
        //  dit ... dither value, only read when STP_DITHER_DEPTH == 1; it contributes 'dit/1024 - 0.5/1024' before
        //          the saturate (an offset of {-0.5/1024 to +0.5/1024} for 'dit' in {0 to 1}; the range of 'dit' is
        //          not visible in this section)
        // Note there is no return statement if STP_DITHER_DEPTH is neither 0 nor 1.
1043    StpF1 StpZPack(StpF1 z, StpF2 k, StpF1 dit) {
1044        #if (STP_DITHER_DEPTH == 0)
1045            return StpSatF1(log2(k.x * z) * k.y);
1046        #endif // (STP_DITHER_DEPTH == 0)
1047        #if (STP_DITHER_DEPTH == 1)
1048            // Fast linearly incorrect dither for 10-bit.
1049            return StpSatF1(log2(k.x * z) * k.y + dit * StpF1_(1.0 / 1024.0) - StpF1_(0.5 / 1024.0));
1050        #endif // (STP_DITHER_DEPTH == 1)
1051    }
1052//==============================================================================================================================
1053    // Build the constants, based on near and far planes.
1054    // The 'far' is where anything more distant clamps to 1.0.
        // Returns the decode constants consumed by StpZUnpack():
        //  k.x = log2(far/near)
        //  k.y = near
1055    StpF2 StpZUnCon(StpF1 near, StpF1 far) {
1056        StpF2 k;
1057        k.x = log2(far * StpRcpF1(near));
1058        k.y = near;
1059        return k; }
1060//------------------------------------------------------------------------------------------------------------------------------
1061    // Where 'k' is generated by StpZUnCon().
        // Decodes an encoded depth 'x' back to depth as 'exp2(x * k.x) * k.y', so 'x = 0' gives 'near' and 'x = 1' gives 'far'.
1062    StpF1 StpZUnpack(StpF1 x, StpF2 k) { return exp2(x * k.x) * k.y; }
1063#endif // defined(STP_GPU)
1064////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1065////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1066//_____________________________________________________________.._______________________________________________________________
1067//==============================================================================================================================
1068//                                            STATIC GEOMETRY MOTION FORWARD PROJECTION
1069//==============================================================================================================================
1070// This is a separate section simply for documentation.
1071// This logic must be computed in 32-bit precision (in theory).
1072//------------------------------------------------------------------------------------------------------------------------------
1073// MOTION MATCH NOTES
1074// ==================
1075// - The 'position - motion' is the reprojected position.
1076// - Where {0 to 1} spans from no motion to one full screen of motion.
1077// - Motion check works with a differential vector '((motionPrior - motionCurrent) * kC)'.
1078// - For static forward projection it will be '((motionPrior*0.5 - motionCurrent) * kC)'.
1079//    - Due to motionPrior being in {-1 to 1} NDC instead of {0 to 1} for screen.
1080// - Working with motion vector differences to avoid complexity with jitter.
1081//------------------------------------------------------------------------------------------------------------------------------
1082// MOTION VECTOR NOTES
1083// ===================
1084// - 'reprojection = position - motion'
1085// - 'reprojection + motion = position'
1086// - 'motion = position - reprojection'
1087// - So motion points forward.
1088//------------------------------------------------------------------------------------------------------------------------------
1089// FORWARD PROJECTION LOGIC
1090// ========================
1091// HAVE INPUT {0 TO 1} SCREEN POSITION
1092//  xy
1093// GET XY INTO {-1 TO 1} NDC [2 FMA, CANNOT FACTOR, NEED AT END]
1094//  x=x*2-1
1095//  y=y*2-1
1096// HAVE INPUT {0 TO INF} DEPTH
1097//  z
1098// GET FROM {XY NDC, DEPTH} TO 3D VIEW POSITION [4 FMA]
1099//  xx=x*((z*g+h)/a) ... xx=x*(z*(g/a)+(h/a)) ... xx=x*(z*k0+k1)
1100//  yy=y*((z*g+h)/b) ... yy=y*(z*(g/b)+(h/b)) ... yy=y*(z*k2+k3)
1101// TRANSFORM TO NEW VIEW
1102//  xxx=xx*i+yy*j+z*k+l
1103//  yyy=xx*m+yy*n+z*o+p
1104//  zzz=xx*q+yy*r+z*s+t
1105// PROJECTION [9 FMA]
1106//  xxxx=xxx*a ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) ..... xxxx=xx*k4+yy*k5+z*k6+k7
1107//  yyyy=yyy*b ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) ..... yyyy=xx*k8+yy*k9+z*kA+kB
1108//  wwww=zzz*g+h ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h) ... wwww=xx*kC+yy*kD+z*kE+kF
1109// PERSPECTIVE DIVIDE [1 RCP]
1110//  xxxxx=xxxx/wwww
1111//  yyyyy=yyyy/wwww
1112// SUBTRACT TO GET 2X MOTION [2 FMA]
1113//  u=xxxxx-x ... u=xxxx*(1/wwww)-x
1114//  v=yyyyy-y ... v=yyyy*(1/wwww)-y
1115// CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
1116//  k0=g/a ... Constants {a,b,c,d,g,h} for prior projection
1117//  k1=h/a
1118//  k2=g/b
1119//  k3=h/b
1120//  k4=i*a ... Constants {a,b,c,d,g,h} for next projection
1121//  k5=j*a
1122//  k6=k*a
1123//  k7=l*a
1124//  k8=m*b
1125//  k9=n*b
1126//  kA=o*b
1127//  kB=p*b
1128//  kC=q*g
1129//  kD=r*g
1130//  kE=s*g
1131//  kF=t*g+h
1132//------------------------------------------------------------------------------------------------------------------------------
1133// BACKWARD PROJECTION LOGIC
1134// =========================
1135//  This starts from '3D VIEW POSITION' of 'FORWARD PROJECTION LOGIC', but with different constants.
1136// TRANSFORM TO NEW VIEW
1137//  xxx=xx*i+yy*j+z*k+l
1138//  yyy=xx*m+yy*n+z*o+p
1139//  zzz=xx*q+yy*r+z*s+t
1140// PROJECTION [9 FMA]
1141//  xxxx=xxx*a ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) ..... xxxx=xx*kG+yy*kH+z*kI+kJ
1142//  yyyy=yyy*b ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) ..... yyyy=xx*kK+yy*kL+z*kM+kN
1143//  wwww=zzz*g+h ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h) ... wwww=xx*kO+yy*kP+z*kQ+kR
1144// PERSPECTIVE DIVIDE [1 RCP]
1145//  xxxxx=xxxx/wwww
1146//  yyyyy=yyyy/wwww
1147// SUBTRACT TO GET 2X MOTION [2 FMA]
1148//  u=xxxxx-x ... u=xxxx*(1/wwww)-x
1149//  v=yyyyy-y ... v=yyyy*(1/wwww)-y
1150// CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
1151//  kG=i*a ... Constants {a,b,c,d,g,h} for previous prior projection, and {i,j,k,l,m,n,o,p,q,r,s,t} for prior back projection
1152//  kH=j*a
1153//  kI=k*a
1154//  kJ=l*a
1155//  kK=m*b
1156//  kL=n*b
1157//  kM=o*b
1158//  kN=p*b
1159//  kO=q*g
1160//  kP=r*g
1161//  kQ=s*g
1162//  kR=t*g+h
1163//==============================================================================================================================
1164// GET FROM {0 TO 1} TO {-1 TO 1}
1165// ==============================
1166// - Get to NDC for {x,y}
1167//   X:=x*2-1
1168//   Y:=y*2-1
1169//------------------------------------------------------------------------------------------------------------------------------
1170// FORWARD VIEW
1171// ============
1172// - Using 12 values
1173//    X:=x*i+y*j+z*k+l
1174//    Y:=x*m+y*n+z*o+p
1175//    Z:=x*q+y*r+z*s+t
1176//    W:=1
1177//     i j k l
1178//     m n o p
1179//     q r s t
1180//     0 0 0 1
1181//------------------------------------------------------------------------------------------------------------------------------
1182// PROJECTIONS
1183// ===========
1184// - INPUTS
1185//    n ... near plane z
1186//    f ... far plane z
1187// - DX ORTHO PROJECTION
1188//    c:=1/(f-n)
1189//    d:=-n/(f-n)
1190//    X:=x*a
1191//    Y:=y*b
1192//    Z:=z*c+d ... (w=1 on input)
1193//    W:=1
1194//     a 0 0 0
1195//     0 b 0 0
1196//     0 0 c d
1197//     0 0 0 1
1198// - DX PERSPECTIVE PROJECTION (LEFT HANDED)
1199//    c:=f/(f-n)
1200//    d:=-(f*n)/(f-n)
1201//    X:=x*a
1202//    Y:=y*b
1203//    Z:=z*c+d ... (w=1 on input)
1204//    W:=z
1205//     a 0 0 0
1206//     0 b 0 0
1207//     0 0 c d
1208//     0 0 1 0 ... (note DX allows the 1 to be non-one)
1209// - DX PERSPECTIVE PROJECTION REVERSED FOR BETTER PRECISION (LEFT HANDED)
1210//    c:=-n/(f-n)
1211//    d:=(f*n)/(f-n)
1212//    X:=x*a
1213//    Y:=y*b
1214//    Z:=z*c+d ... (w=1 on input)
1215//    W:=z
1216//     a 0 0 0
1217//     0 b 0 0
1218//     0 0 c d
1219//     0 0 1 0
1220// - DX PERSPECTIVE PROJECTION REVERSED WITH INF FAR (LEFT HANDED)
1221//    X:=x*a
1222//    Y:=y*b
1223//    Z:=n ... (w=1 on input)
1224//    W:=z
1225//    a 0 0 0
1226//    0 b 0 0
1227//    0 0 0 n
1228//    0 0 1 0
1229// - GL PERSPECTIVE PROJECTION
1230//    c:=-(f+n)/(f-n)
1231//    d:=-(2fn)/(f-n)
1232//    X:=x*a
1233//    Y:=y*b
1234//    Z:=z*c+d ... (w=1 on input)
1235//    W:=z
1236//     a 0  0 0
1237//     0 b  0 0
1238//     0 0  c d
1239//     0 0 -1 0
1240// - GENERALIZED (WILL DO ANYTHING)
1241//    X:=x*a
1242//    Y:=y*b
1243//    Z:=z*c+d ... (w=1 on input)
1244//    W:=z*g+h
1245//     a 0 0 0
1246//     0 b 0 0
1247//     0 0 c d
1248//     0 0 g h
1249//------------------------------------------------------------------------------------------------------------------------------
1250// PROJECTED TO NDC
1251// ================
1252// - Ignoring viewport transform
1253//    X:=x/w
1254//    Y:=y/w
1255//    Z:=z/w
1256//    W:=1/w
1257// - Inverse
1258//    x=X*w
1259//    y=Y*w
1260//==============================================================================================================================
1261//                                             MODIFICATIONS FOR COMPLEX PROJECTIONS
1262//------------------------------------------------------------------------------------------------------------------------------
1263// Since this worked out to just 2 more FMAs and 2 more constants, decided not to create a shader permutation.
1264//==============================================================================================================================
1265// COMPLEX PROJECTION
1266// ==================
1267// - GL PERSPECTIVE PROJECTION - WITH Z BASED {X,Y} MODIFICATIONS
1268//    c:=-(F+N)/(F-N)
1269//    d:=-(2FN)/(F-N)
1270//    X:=x*a + z*e
1271//    Y:=y*b + z*f
1272//    Z:=z*c+d ... (w=1 on input)
1273//    W:=z
1274//     a 0  e 0
1275//     0 b  f 0
1276//     0 0  c d
1277//     0 0 -1 0
1278// - GENERALIZED (WILL DO ANYTHING) - WITH Z BASED {X,Y} MODIFICATIONS
1279//    X:=x*a + z*e
1280//    Y:=y*b + z*f
1281//    Z:=z*c+d ... (w=1 on input)
1282//    W:=z*g+h
1283//     a 0 e 0
1284//     0 b f 0
1285//     0 0 c d
1286//     0 0 g h
1287// - INVERSE GIVEN 'z'
1288//    X:=x*a + z*e
1289//    Y:=y*b + z*f
1290//    X - z*e:=x*a
1291//    Y - z*f:=y*b
1292//    X/a - z*e/a:=x
1293//    Y/b - z*f/b:=y
1294//------------------------------------------------------------------------------------------------------------------------------
1295// FORWARD PROJECTION LOGIC
1296// ========================
1297// HAVE INPUT {0 TO 1} SCREEN POSITION
1298//  xy
1299// GET XY INTO {-1 TO 1} NDC [2 FMA, CANNOT FACTOR, NEED AT END]
1300//  x=x*2-1
1301//  y=y*2-1
1302// HAVE INPUT {0 TO INF} DEPTH
1303//  z
1304// GET FROM {XY NDC, DEPTH} TO 3D VIEW POSITION [4 FMA]
1305//   ... have {X,Y,z}
1306//   ... xx=(x*(z*g+h))*(1/a) + z*(e/a)
1307//   ... yy=(y*(z*g+h))*(1/b) + z*(f/b)
1308//   ... xx=x*((z*g+h)/a) + z*(e/a)
1309//   ... yy=y*((z*g+h)/b) + z*(f/b)
1310//   ... xx=x*(z*(g/a)+(h/a)) + z*(e/a)
1311//   ... yy=y*(z*(g/b)+(h/b)) + z*(f/b)
1312//  xx=x*(z*k0+k1)+z*k2
1313//  yy=y*(z*k3+k4)+z*k5
1314// TRANSFORM TO NEW VIEW
1315//  xxx=xx*i+yy*j+z*k+l
1316//  yyy=xx*m+yy*n+z*o+p
1317//  zzz=xx*q+yy*r+z*s+t
1318// PROJECTION [9 FMA]
1319//  xxxx=xxx*a+zzz*e
1320//   ... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) + xx*(q*e)+yy*(r*e)+z*(s*e)+(t*e)
1321//   ... xxxx=xx*k6+yy*k7+z*k8+k9
1322//  yyyy=yyy*b+zzz*f
1323//   ... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) + xx*(q*f)+yy*(r*f)+z*(s*f)+(t*f)
1324//   ... yyyy=xx*kA+yy*kB+z*kC+kD
1325//  wwww=zzz*g+h
1326//   ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h)
1327//   ... wwww=xx*kE+yy*kF+z*kG+kH
1328// PERSPECTIVE DIVIDE [1 RCP]
1329//  xxxxx=xxxx/wwww
1330//  yyyyy=yyyy/wwww
1331// SUBTRACT TO GET 2X MOTION [2 FMA]
1332//  u=xxxxx-x ... u=xxxx*(1/wwww)-x
1333//  v=yyyyy-y ... v=yyyy*(1/wwww)-y
1334// CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
1335//  k0=g/a ... Constants {a,b,c,d,e,f,g,h} for prior projection
1336//  k1=h/a
1337//  k2=e/a
1338//  k3=g/b
1339//  k4=h/b
1340//  k5=f/b
1341//  k6=(i*a)+(q*e) ... Constants {a,b,c,d,e,f,g,h} for next projection
1342//  k7=(j*a)+(r*e)
1343//  k8=(k*a)+(s*e)
1344//  k9=(l*a)+(t*e)
1345//  kA=(m*b)+(q*f)
1346//  kB=(n*b)+(r*f)
1347//  kC=(o*b)+(s*f)
1348//  kD=(p*b)+(t*f)
1349//  kE=q*g
1350//  kF=r*g
1351//  kG=s*g
1352//  kH=t*g+h
1353//------------------------------------------------------------------------------------------------------------------------------
1354// BACKWARD PROJECTION LOGIC
1355// =========================
1356//  This starts from '3D VIEW POSITION' of 'FORWARD PROJECTION LOGIC', but with different constants.
1357// TRANSFORM TO NEW VIEW
1358//  xxx=xx*i+yy*j+z*k+l
1359//  yyy=xx*m+yy*n+z*o+p
1360//  zzz=xx*q+yy*r+z*s+t
1361// PROJECTION [9 FMA]
1362//  xxxx=xxx*a+zzz*e
1363//   ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) + xx*(q*e)+yy*(r*e)+z*(s*e)+(t*e)
1364//   ..... xxxx=xx*kI+yy*kJ+z*kK+kL
1365//  yyyy=yyy*b+zzz*f
1366//   ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) + xx*(q*f)+yy*(r*f)+z*(s*f)+(t*f)
1367//   ..... yyyy=xx*kM+yy*kN+z*kO+kP
1368//  wwww=zzz*g+h
1369//   ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h)
1370//   ... wwww=xx*kQ+yy*kR+z*kS+kT
1371// PERSPECTIVE DIVIDE [1 RCP]
1372//  xxxxx=xxxx/wwww
1373//  yyyyy=yyyy/wwww
1374// SUBTRACT TO GET 2X MOTION [2 FMA]
1375//  u=xxxxx-x ... u=xxxx*(1/wwww)-x
1376//  v=yyyyy-y ... v=yyyy*(1/wwww)-y
1377// CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
1378//   ... Constants {a,b,c,d,e,f,g,h} for previous prior projection
1379//   ... Constants {i,j,k,l,m,n,o,p,q,r,s,t} for prior back projection
1380//  kI=(i*a)+(q*e)
1381//  kJ=(j*a)+(r*e)
1382//  kK=(k*a)+(s*e)
1383//  kL=(l*a)+(t*e)
1384//  kM=(m*b)+(q*f)
1385//  kN=(n*b)+(r*f)
1386//  kO=(o*b)+(s*f)
1387//  kP=(p*b)+(t*f)
1388//  kQ=q*g
1389//  kR=r*g
1390//  kS=s*g
1391//  kT=t*g+h
1392//==============================================================================================================================
1393#if defined(STP_GPU)
1394    // Generates forward {-1 to 1} NDC forward projection vectors given (from prior frame),
1395    //  p .... {0 to 1} screen position
1396    //  z .... {0 to INF} depth
1397    //  m .... {0 to 1} prior motion vector
1398    // The results are approximately corrected for dynamic motion.
1399    // This takes 'dynamicMotion = priorMotionVector - priorStaticGeometryBackprojection'
1400    // Then adds that estimate of dynamic motion to the static geometry forward projection vector.
        // Remaining arguments,
        //  kMotionMatch ... scale factor applied to the dynamic estimate ('bugD')
        //  k0123..kST ..... the constants k0 to kT of the complex projection notes above, packed in order:
        //                   k0-k5 rebuild the view position, k6-kH are the forward projection,
        //                   kI-kT are the backward projection
        //  bugF ........... output, static geometry forward estimate
        //  bugD ........... output, dynamic motion estimate, already scaled by 'kMotionMatch'
        // Returns 'bugF + bugD'.
1401    StpF2 StpFor(StpF2 p, StpF1 z, StpF2 m, StpF1 kMotionMatch,
1402    StpF4 k0123, StpF4 k4567, StpF4 k89AB, StpF4 kCDEF, StpF4 kGHIJ, StpF4 kKLMN, StpF4 kOPQR, StpF2 kST,
1403    out StpF2 bugF, out StpF2 bugD){
1404        // Implements the logic described above in the comments.
            // Screen {0 to 1} to NDC {-1 to 1}.
1405        p = p * StpF2_(2.0) - StpF2_(1.0);
            // 'q' is the {xx,yy} view position term (uses k0 to k5).
1406        StpF2 q;
1407        q.x = p.x * (z * k0123.x + k0123.y) + (z * k0123.z);
1408        q.y = p.y * (z * k0123.w + k4567.x) + (z * k4567.y);
            // 'v' is the forward projection {xxxx,yyyy,wwww} (uses k6 to kH), 'v.z' is then replaced by its reciprocal.
1409        StpF3 v;
1410        v.x = q.x * k4567.z + q.y * k4567.w + z * k89AB.x + k89AB.y;
1411        v.y = q.x * k89AB.z + q.y * k89AB.w + z * kCDEF.x + kCDEF.y;
1412        v.z = q.x * kCDEF.z + q.y * kCDEF.w + z * kGHIJ.x + kGHIJ.y;
1413        v.z = StpRcpF1(v.z);
            // 'v2' is the backward projection {xxxx,yyyy,wwww} (uses kI to kT), 'v2.z' is then replaced by its reciprocal.
1414        StpF3 v2;
1415        v2.x = q.x * kGHIJ.z + q.y * kGHIJ.w + z * kKLMN.x + kKLMN.y;
1416        v2.y = q.x * kKLMN.z + q.y * kKLMN.w + z * kOPQR.x + kOPQR.y;
1417        v2.z = q.x * kOPQR.z + q.y * kOPQR.w + z *   kST.x +   kST.y;
1418        v2.z = StpRcpF1(v2.z);
1419        // Motion vector points forward (to estimated position in next frame).
1420        // Negative motion vector points back to where the pixel was in the prior frame.
1421        // Motion vector is {0 to 1} for one screen, but this logic is {-1 to 1} based (hence a 2x scaling).
1422        bugF = (v.xy * StpF2_(v.z) - p); // Static forward estimate.
1423        bugD = ((StpF2_(2.0) * m) - (p - v2.xy * StpF2_(v2.z))) * StpF2_(kMotionMatch); // Dynamic estimate.
1424        return bugF + bugD; }
1425#endif // defined(STP_GPU)
1426////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1427////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1428//_____________________________________________________________.._______________________________________________________________
1429//==============================================================================================================================
1430//                                                    MOTION VECTOR ENCODING
1431//------------------------------------------------------------------------------------------------------------------------------
1432// {MSB 10-bit depth, LSB {11,11}-bit motion with sqrt() encoding}
1433// Motion is encoded in sqrt() space.
1434//------------------------------------------------------------------------------------------------------------------------------
1435// 11111111111111110000000000000000
1436// fedcba9876543210fedcba9876543210
1437// ================================
1438// zzzzzzzzzz...................... 10-bit encoded z
1439// ..........yyyyyyyyyyy........... 11-bit {-1 to <1} y encoded in gamma 2.0 (sqrt)
1440// .....................xxxxxxxxxxx 11-bit {-1 to <1} x encoded in gamma 2.0 (sqrt)
1441//------------------------------------------------------------------------------------------------------------------------------
1442// The 32-bit path is 8 ops to decode {x,y}.
1443//------------------------------------------------------------------------------------------------------------------------------
1444// There once was a 16-bit path which takes 6 ops to decode (a bit extra because ABS isn't free).
1445//     hhhhhhhhhhhhhhhhllllllllllllllll
1446//     ================================
1447//     zzzzzzzzzzyyyyyyyyyyyxxxxxxxxxxx  input
1448//     zzzzzyyyyyyyyyyyxxxxxxxxxxx00000  << 5
1449//     00000yyyyyyyyyyyxxxxxxxxxxx00000  & 0x7FFFFFF
1450//     00000yyyyyyyyyyy00000xxxxxxxxxxx  >> 5 (for 16-bit LSB only)
1451// This gets 11-bit integers which perfectly alias lowest non-denormal and denormals of FP16.
1452// Can scale by '16384' and subtract 1 to decompress without a CVT.
1453//==============================================================================================================================
1454#if defined(STP_GPU)
1455    // The 'z' comes in {0 to 1}.
1456    // This depends on 'v' ranging inside and including {-1 to 1}.
        // Packs encoded depth and motion into one 32-bit word: 'z' in bits 31..22, 'v.y' in bits 21..11, 'v.x' in bits 10..0.
        //  z ..... encoded depth, stored as the unsigned conversion of 'z * 1023'
        //  v ..... motion vector, stored per component as sign-preserving sqrt() in 11 bits
        //  dit ... dither value, only read when STP_DITHER_MOTION is non-zero
        // The three fields are combined with '+', so each one has to stay inside its own bit range.
1457    StpU1 StpMvPack(StpF1 z, StpF2 v, StpF1 dit) {
1458        // {-1 to 1} linear to gamma 2.0 {-1 to 1}
1459        #if STP_DITHER_MOTION
               // Dither is added to the sqrt() magnitude and saturated before the sign of 'v' is copied back.
1460           v = StpCpySgnF2(StpSatF2(sqrt(abs(v)) + StpF2_(dit * StpF1_(1.0 / 1024.0) - StpF1_(0.5 / 1024.0))), v);
1461        #else
1462           v = StpCpySgnF2(sqrt(abs(v)), v);
1463        #endif
1464        // Limit to {-1024/1024 to 1023/1024}.
            // Only the upper bound is applied here.
1465        v = min(v, StpF2_(1023.0/1024.0));
1466        // Encode to 11-bit with zero at center of one step.
1467        v = v * StpF2_(1024.0) + StpF2_(1024.0);
1468        // Pack.
1469        return (StpU1(z * StpF1(1023.0)) << StpU1(22)) + (StpU1(v.y) << StpU1(11)) + StpU1(v.x); }
1470//------------------------------------------------------------------------------------------------------------------------------
1471    // Unpacks all.
1472    void StpMvUnpack(out StpF1 z, out StpF2 v, StpU1 i) {
1473        // Bits 22..31 hold depth, rescaled here by 1/1023.
1474        z = StpF1(StpBfeU1(i, 22u, 10u)) * StpF1_(1.0 / 1023.0);
1475        // Two 11-bit fields hold the gamma 2.0 motion: 'y' starts at bit 11, 'x' at bit 0.
1476        StpF2 g;
1477        g.y = StpF1(StpBfeU1(i, 11u, 11u)) * StpF1_(1.0 / 1024.0) + StpF1_(-1.0);
1478        g.x = StpF1(StpBfeU1(i, 0, 11u)) * StpF1_(1.0 / 1024.0) + StpF1_(-1.0);
1479        v = g * abs(g); } // signed square undoes the sqrt encode
1480//------------------------------------------------------------------------------------------------------------------------------
1481    // Unpack just velocity.
1482    void StpMvUnpackV(out StpF2 v, StpU1 i) {
1483        // Same velocity decode as StpMvUnpack(); the depth field in the upper bits is not read.
1484        StpF2 g;
1485        g.y = StpF1(StpBfeU1(i, 11u, 11u)) * StpF1_(1.0 / 1024.0) + StpF1_(-1.0);
1486        g.x = StpF1(StpBfeU1(i, 0, 11u)) * StpF1_(1.0 / 1024.0) + StpF1_(-1.0);
1487        v = g * abs(g); } // signed square undoes the sqrt encode
1488#endif // defined(STP_GPU)
1489////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1490////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1491//_____________________________________________________________.._______________________________________________________________
1492//==============================================================================================================================
1493//                                                       COLOR CONVERSION
1494//==============================================================================================================================
1495#if defined(STP_GPU)
1496    // Scaling in the reversible tonemapper (should be >= 1).
1497    // Getting too close to 1.0 will result in luma inversions in highly saturated content in the oldest algorithm.
1498    // Using 4.0 or ideally 8.0 is recommended.
        // Read by the StpTone*() helpers as the additive term of the denominator, and by StpToneInv*() as '1.0 / STP_SAT'.
1499    #define STP_SAT 4.0
1500#endif // defined(STP_GPU)
1501//==============================================================================================================================
1502#if defined(STP_GPU) && defined(STP_32BIT)
1503    void StpToneF1(inout StpF1 x) { x = StpSatF1(x * StpF1_(StpRcpF1(StpF1_(STP_SAT) + x))); } // Sat(x / (STP_SAT + x))
1504//------------------------------------------------------------------------------------------------------------------------------
1505    // Reversible tonemapper: every channel is scaled by 1 / (STP_SAT + largest channel); StpToneInvF3() undoes it.
1506    void StpToneF3(inout StpF3 x) {
1507        StpF1 peak = StpMax3F1(x.r, x.g, x.b);
1508        x = StpSatF3(x * StpF3_(StpRcpF1(StpF1_(STP_SAT) + peak))); }
1509//------------------------------------------------------------------------------------------------------------------------------
1510    void StpToneInvF3(inout StpF3 x) {
1511        // Inverse of StpToneF3(): the denominator is (1 - largest channel) / STP_SAT, evaluated in the form below.
1512        StpF1 peak = StpMax3F1(x.r, x.g, x.b);
1513        // Denominator floor is 1/16384 (output maximum 16384); 32768.0 caused problems in Unity with bloom on some platforms.
1514        StpF1 den = max(StpF1_(1.0 / 16384.0), StpSatF1(StpF1_(1.0 / STP_SAT) - peak * StpF1_(1.0 / STP_SAT)));
1515        x *= StpF3_(StpRcpF1(den)); }
1516//------------------------------------------------------------------------------------------------------------------------------
1517    // This is currently unused but left in for reference.
1518    // Convert LDR RGB to Gamma 2.0 RGB {0 to 1}.
1519    // This is for storage to 8-bit.
1520    // This is temporal dithered.
1521    // Unoptimized logic (for reference).
1522    //     StpF3 n = sqrt(c);
1523    //     n = floor(n * StpF3_(255.0)) * StpF3_(1.0 / 255.0);
1524    //     StpF3 a = n * n;
1525    //     StpF3 b = n + StpF3_(1.0 / 255.0); b = b * b;
1526    //     // Ratio of 'a' to 'b' required to produce 'c'.
1527    //     StpF3 r = (c - b) * StpRcpF3(a - b);
1528    //     // Use the ratio as a cutoff to choose 'a' or 'b'.
1529    //     c = StpSatF3(n + StpGtZeroF3(StpF3_(dit) - r) * StpF3_(1.0 / 255.0));
1530    // Optimized from 57 to 42 clks on GCN.
1531    StpF3 StpRgbGamDit8F3(StpF3 c, StpF1 dit) {
1532        StpF3 lo = floor(sqrt(c) * StpF3_(255.0)) * StpF3_(1.0 / 255.0); // sqrt(c) floored onto the 1/255 grid
1533        StpF3 hi = lo + StpF3_(1.0 / 255.0); // next grid step up
1534        StpF3 hiSq = hi * hi;
1535        StpF3 pick = StpGtZeroF3(StpF3_(dit) * (hiSq - lo * lo) - (hiSq - c)); // 'dit' against where 'c' sits between the squares
1536        return StpSatF3(lo + pick * StpF3_(1.0 / 255.0)); }
1537//------------------------------------------------------------------------------------------------------------------------------
1538    // This is currently unused but left in for reference.
1539    // Version for 10-bit for feedback.
1540    StpF3 StpRgbGamDit10F3(StpF3 c, StpF1 dit) {
1541        StpF3 lo = floor(sqrt(c) * StpF3_(1023.0)) * StpF3_(1.0 / 1023.0); // sqrt(c) floored onto the 1/1023 grid
1542        StpF3 hi = lo + StpF3_(1.0 / 1023.0); // next grid step up
1543        StpF3 hiSq = hi * hi;
1544        StpF3 pick = StpGtZeroF3(StpF3_(dit) * (hiSq - lo * lo) - (hiSq - c)); // 'dit' against where 'c' sits between the squares
1545        return StpSatF3(lo + pick * StpF3_(1.0 / 1023.0)); }
1546//------------------------------------------------------------------------------------------------------------------------------
1547    // Can use this function to convert feedback back to color.
1548    void StpFeed2ClrF(inout StpF3 c) {
1549        c = c * c; // per-channel square, i.e. the inverse of a sqrt (gamma 2.0) encode
1550        #if (STP_POSTMAP == 0)
1551            StpToneInvF3(c.rgb); // tonemap is inverted only when STP_POSTMAP is 0
1552        #endif
1553    }
1554#endif // defined(STP_GPU) && defined(STP_32BIT)
1555//==============================================================================================================================
1556#if defined(STP_GPU) && defined(STP_32BIT)
1557    void StpToneMF1(inout StpMF1 x) { x = StpSatMF1(x * StpMF1_(StpRcpMF1(StpMF1_(STP_SAT) + x))); } // Sat(x / (STP_SAT + x))
1558//------------------------------------------------------------------------------------------------------------------------------
1559    void StpToneMF3(inout StpMF3 x) {
1560        StpMF1 peak = StpMax3MF1(x.r, x.g, x.b); // one shared scale, taken from the largest channel
1561        x = StpSatMF3(x * StpMF3_(StpRcpMF1(StpMF1_(STP_SAT) + peak))); }
1562//------------------------------------------------------------------------------------------------------------------------------
1563    void StpToneInvMF3(inout StpMF3 x) {
1564        // Inverse of StpToneMF3(); the 1/16384 floor on the denominator bounds the reciprocal at 16384.
1565        StpMF1 peak = StpMax3MF1(x.r, x.g, x.b);
1566        StpMF1 den = max(StpMF1_(1.0 / 16384.0), StpSatMF1(StpMF1_(1.0 / STP_SAT) - peak * StpMF1_(1.0 / STP_SAT)));
1567        x *= StpMF3_(StpRcpMF1(den)); }
1568//------------------------------------------------------------------------------------------------------------------------------
1569    StpMF3 StpRgbGamDit8MF3(StpMF3 c, StpMF1 dit) {
1570        StpMF3 lo = floor(sqrt(c) * StpMF3_(255.0)) * StpMF3_(1.0 / 255.0); // sqrt(c) floored onto the 1/255 grid
1571        StpMF3 hi = lo + StpMF3_(1.0 / 255.0); // next grid step up
1572        StpMF3 hiSq = hi * hi;
1573        StpMF3 pick = StpGtZeroMF3(StpMF3_(dit) * (hiSq - lo * lo) - (hiSq - c)); // 'dit' against where 'c' sits between the squares
1574        return StpSatMF3(lo + pick * StpMF3_(1.0 / 255.0)); }
1575//------------------------------------------------------------------------------------------------------------------------------
1576    StpMF3 StpRgbGamDit10MF3(StpMF3 c, StpMF1 dit) {
1577        StpMF3 lo = floor(sqrt(c) * StpMF3_(1023.0)) * StpMF3_(1.0 / 1023.0); // sqrt(c) floored onto the 1/1023 grid
1578        StpMF3 hi = lo + StpMF3_(1.0 / 1023.0); // next grid step up
1579        StpMF3 hiSq = hi * hi;
1580        StpMF3 pick = StpGtZeroMF3(StpMF3_(dit) * (hiSq - lo * lo) - (hiSq - c)); // 'dit' against where 'c' sits between the squares
1581        return StpSatMF3(lo + pick * StpMF3_(1.0 / 1023.0)); }
1582//------------------------------------------------------------------------------------------------------------------------------
1583    void StpFeed2ClrMF(inout StpMF3 c) {
1584        c = c * c; // per-channel square, i.e. the inverse of a sqrt (gamma 2.0) encode
1585        #if (STP_POSTMAP == 0)
1586            StpToneInvMF3(c.rgb); // tonemap is inverted only when STP_POSTMAP is 0
1587        #endif
1588    }
1589#endif // defined(STP_GPU) && defined(STP_32BIT)
1590//==============================================================================================================================
1591#if defined(STP_GPU) && defined(STP_16BIT)
1592    void StpToneH1(inout StpH1 x) { x = StpSatH1(x * StpH1_(StpRcpH1(StpH1_(STP_SAT) + x))); } // Sat(x / (STP_SAT + x))
1593//------------------------------------------------------------------------------------------------------------------------------
1594    void StpToneH3(inout StpH3 x) {
1595        StpH1 peak = StpMax3H1(x.r, x.g, x.b); // one shared scale, taken from the largest channel
1596        x = StpSatH3(x * StpH3_(StpRcpH1(StpH1_(STP_SAT) + peak))); }
1597//------------------------------------------------------------------------------------------------------------------------------
1598    void StpToneInvH3(inout StpH3 x) {
1599        StpH1 peak = StpMax3H1(x.r, x.g, x.b); // inverse of StpToneH3(): scale is rebuilt from the largest channel
1600        StpH1 den = max(StpH1_(1.0 / 16384.0), StpSatH1(StpH1_(1.0 / STP_SAT) - peak * StpH1_(1.0 / STP_SAT)));
1601        x *= StpH3_(StpRcpH1(den)); }
1602//------------------------------------------------------------------------------------------------------------------------------
1603    StpH3 StpRgbGamDit8H3(StpH3 c, StpH1 dit) {
1604        StpH3 lo = floor(sqrt(c) * StpH3_(255.0)) * StpH3_(1.0 / 255.0); // sqrt(c) floored onto the 1/255 grid
1605        StpH3 hi = lo + StpH3_(1.0 / 255.0); // next grid step up
1606        StpH3 hiSq = hi * hi;
1607        StpH3 pick = StpGtZeroH3(StpH3_(dit) * (hiSq - lo * lo) - (hiSq - c)); // 'dit' against where 'c' sits between the squares
1608        return StpSatH3(lo + pick * StpH3_(1.0 / 255.0)); }
1609//------------------------------------------------------------------------------------------------------------------------------
1610    StpH3 StpRgbGamDit10H3(StpH3 c, StpH1 dit) {
1611        StpH3 lo = floor(sqrt(c) * StpH3_(1023.0)) * StpH3_(1.0 / 1023.0); // sqrt(c) floored onto the 1/1023 grid
1612        StpH3 hi = lo + StpH3_(1.0 / 1023.0); // next grid step up
1613        StpH3 hiSq = hi * hi;
1614        StpH3 pick = StpGtZeroH3(StpH3_(dit) * (hiSq - lo * lo) - (hiSq - c)); // 'dit' against where 'c' sits between the squares
1615        return StpSatH3(lo + pick * StpH3_(1.0 / 1023.0)); }
1616//------------------------------------------------------------------------------------------------------------------------------
1617    void StpFeed2ClrH(inout StpH3 c) {
1618        c = c * c; // per-channel square, i.e. the inverse of a sqrt (gamma 2.0) encode
1619        #if (STP_POSTMAP == 0)
1620            StpToneInvH3(c.rgb); // tonemap is inverted only when STP_POSTMAP is 0
1621        #endif
1622    }
1623#endif // defined(STP_GPU) && defined(STP_16BIT)
1624////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1625////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1626//_____________________________________________________________.._______________________________________________________________
1627//==============================================================================================================================
1628//                                                   COLOR CONVERSION TOOLS
1629//------------------------------------------------------------------------------------------------------------------------------
1630// Some platforms do not have a hardware sRGB image store (requires manual conversion).
1631//==============================================================================================================================
1632#if defined(STP_GPU) && defined(STP_32BIT)
1633    StpF3 StpLinearToSrgbF3(StpF3 c) {
            // Piecewise sRGB encode: linear toe 'c * 12.92' below the 0.0031308 knee, '1.055 * c^(1/2.4) - 0.055' above it.
            // The segment is selected as the median of {knee, toe, curve}. The median is written out with min/max because
            // clamp(knee, toe, curve) passes bounds with 'toe' >= 'curve', and clamp() with min > max is not guaranteed to
            // act as a median (evaluated as min(max(x, lo), hi) it returns the curve inside the toe, e.g. -0.055 at c = 0).
1634        StpF3 j = StpF3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); StpF2 k = StpF2(1.055, -0.055);
            StpF3 toe = c * j.yyy;
            StpF3 curve = pow(c, j.zzz) * k.xxx + k.yyy;
1635        return max(min(j.xxx, toe), min(max(j.xxx, toe), curve)); }
1636//------------------------------------------------------------------------------------------------------------------------------
1637    StpMF3 StpLinearToSrgbMF3(StpMF3 c) {
            // Piecewise sRGB encode: linear toe 'c * 12.92' below the 0.0031308 knee, '1.055 * c^(1/2.4) - 0.055' above it.
            // The segment is selected as the median of {knee, toe, curve}. The median is written out with min/max because
            // clamp(knee, toe, curve) passes bounds with 'toe' >= 'curve', and clamp() with min > max is not guaranteed to
            // act as a median (evaluated as min(max(x, lo), hi) it returns the curve inside the toe, e.g. -0.055 at c = 0).
1638        StpMF3 j = StpMF3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); StpMF2 k = StpMF2(1.055, -0.055);
            StpMF3 toe = c * j.yyy;
            StpMF3 curve = pow(c, j.zzz) * k.xxx + k.yyy;
1639        return max(min(j.xxx, toe), min(max(j.xxx, toe), curve)); }
1640#endif // defined(STP_GPU) && defined(STP_32BIT)
1641//==============================================================================================================================
1642#if defined(STP_GPU) && defined(STP_16BIT)
1643    StpH3 StpLinearToSrgbH3(StpH3 c) {
            // Piecewise sRGB encode: linear toe 'c * 12.92' below the 0.0031308 knee, '1.055 * c^(1/2.4) - 0.055' above it.
            // The segment is selected as the median of {knee, toe, curve}. The median is written out with min/max because
            // clamp(knee, toe, curve) passes bounds with 'toe' >= 'curve', and clamp() with min > max is not guaranteed to
            // act as a median (evaluated as min(max(x, lo), hi) it returns the curve inside the toe, e.g. -0.055 at c = 0).
1644        StpH3 j = StpH3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); StpH2 k = StpH2(1.055, -0.055);
            StpH3 toe = c * j.yyy;
            StpH3 curve = pow(c, j.zzz) * k.xxx + k.yyy;
1645        return max(min(j.xxx, toe), min(max(j.xxx, toe), curve)); }
1646#endif // defined(STP_GPU) && defined(STP_16BIT)
1647////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1648////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1649//_____________________________________________________________.._______________________________________________________________
1650//==============================================================================================================================
1651//                                                         DEBUG COMMON
1652//==============================================================================================================================
1653#if defined(STP_GPU) && STP_BUG
1654    void StpBugF(StpU3 p, StpF4 c); // Prototype only (compiled when STP_GPU is defined and STP_BUG is non-zero); presumably defined by the including shader.
1655#endif // defined(STP_GPU) && STP_BUG
1656////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1657////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1658//_____________________________________________________________.._______________________________________________________________
1659//==============================================================================================================================
1660//                                                     CONSTANT GENERATION
1661//==============================================================================================================================
1662STP_STATIC void StpDilCon(
    // Fills one 4-component constant vector from the current image size:
    //  con0[0..1] := 4/size, stored through StpU1_F1().
    //  con0[2..3] := size converted to unsigned and shifted right by 2 (size/4, remainder discarded).
1663// Generated constants.
1664StpInOutU4 con0,
1665// Current image resolution in pixels.
1666StpInF2 imgC) {
1667    // StpF2 kRcpR := 4/size of current input image in pixels.
1668    con0[0] = StpU1_F1(StpF1_(4.0) / imgC[0]);
1669    con0[1] = StpU1_F1(StpF1_(4.0) / imgC[1]);
1670    // StpU2 kR := size/4 of the current input image in pixels.
1671    // Used for pass merging (DIL and SAA), since convergence is 1/16 area of input, must check position.
1672    con0[2] = StpU1_(StpU1_(imgC[0]) >> StpU1_(2));
1673    con0[3] = StpU1_(StpU1_(imgC[1]) >> StpU1_(2)); }
1674//==============================================================================================================================
1675STP_STATIC void StpPatCon(
    // Fills con0..conC with the packed constants named in the comments below (kRcpC, kJitCRcpC, kDepth, k0123, ...),
    // which are the names the pattern entry point (e.g. StpPatF()) unpacks. Float values are stored through StpU1_F1().
    // NOTE(review): kST is written to con5[2..3] here, while StpPatF() in this file reads kST from conD.xy, and this
    // function has no conD output -- confirm which slot the caller is expected to supply.
1676// Generated constants.
1677StpInOutU4 con0,
1678StpInOutU4 con1,
1679StpInOutU4 con2,
1680StpInOutU4 con3,
1681StpInOutU4 con4,
1682StpInOutU4 con5,
1683StpInOutU4 con6,
1684StpInOutU4 con7,
1685StpInOutU4 con8,
1686StpInOutU4 con9,
1687StpInOutU4 conA,
1688StpInOutU4 conB,
1689StpInOutU4 conC,
1690// Linear depth near plane for log2 depth encoding.
1691StpF1 near,
1692// Linear depth far plane for log2 depth encoding.
1693StpF1 far,
1694// Frame count for current frame (sets jitter).
1695StpU1 frame,
1696// Current image resolution in pixels.
1697StpInF2 imgC,
1698// Prior image resolution in pixels.
1699StpInF2 imgP,
1700// Feedback (aka output) resolution in pixels.
1701StpInF2 imgF,
1702// Ratio of 'currentFrameTime/priorFrameTime'.
1703StpF1 motionMatch,
1704// Projection matrix data {a,b,c,d,e,f,g,h}.
1705// This is used to do static geometry forward projection.
1706//  a 0 e 0
1707//  0 b f 0
1708//  0 0 c d
1709//  0 0 g h
1710// For reference, a DX ortho projection would be,
1711//  a 0 e 0
1712//  0 b f 0
1713//  0 0 c d
1714//  0 0 0 1
1715// And a DX, left handed perspective projection would be,
1716//  a 0 e 0
1717//  0 b f 0
1718//  0 0 c d ... c := F/(F-N), d := -(F*N)/(F-N)
1719//  0 0 1 0
    // Each projection arrives as two vectors: ABEF = {a,b,e,f} and CDGH = {c,d,g,h} (so .z/.w of CDGH are g/h).
1720// Previous prior projection.
1721StpInF4 prjPrvABEF,
1722StpInF4 prjPrvCDGH,
1723// Prior projection.
1724StpInF4 prjPriABEF,
1725StpInF4 prjPriCDGH,
1726// Current projection (the difference enables changing zoom).
1727StpInF4 prjCurABEF,
1728StpInF4 prjCurCDGH,
1729// Forward viewspace transform.
1730// Transform prior 3D view position into current 3D view position.
1731// This is used to do static geometry forward projection.
1732//  X := x*i + y*j +z*k +l
1733//  Y := x*m + y*n +z*o +p
1734//  Z := x*q + y*r +z*s +t
1735//  W := 1
1736//   i j k l
1737//   m n o p
1738//   q r s t
1739//   0 0 0 1
1740StpInF4 forIJKL,
1741StpInF4 forMNOP,
1742StpInF4 forQRST,
1743// Prior frame backward viewspace transform.
1744// Transform prior 3D view position into previous-prior 3D view position.
1745// This is used to 'fix' static geometry forward projection for dynamic motion.
1746//  X := x*i + y*j +z*k +l
1747//  Y := x*m + y*n +z*o +p
1748//  Z := x*q + y*r +z*s +t
1749//  W := 1
1750//   i j k l
1751//   m n o p
1752//   q r s t
1753//   0 0 0 1
1754StpInF4 bckIJKL,
1755StpInF4 bckMNOP,
1756StpInF4 bckQRST) {
1757//------------------------------------------------------------------------------------------------------------------------------
1758    // StpF2 kRcpC := 1.0 / size of current input image in pixels.
1759    con0[0] = StpU1_F1(StpF1_(1.0) / imgC[0]);
1760    con0[1] = StpU1_F1(StpF1_(1.0) / imgC[1]);
1761    // StpF2 kHalfRcpC := 0.5 / size of current input image in pixels.
1762    con0[2] = StpU1_F1(StpF1_(0.5) / imgC[0]);
1763    con0[3] = StpU1_F1(StpF1_(0.5) / imgC[1]);
1764//------------------------------------------------------------------------------------------------------------------------------
1765    // Grab jitter for current and prior frames.
    // The prior frame jitter is requested with 'frame - 1' (unsigned arithmetic).
1766    StpVarF2 jitP;
1767    StpVarF2 jitC;
1768    StpJit(jitP, frame - StpU1_(1));
1769    StpJit(jitC, frame);
1770    // StpF2 kJitCRcpCUnjitPRcpP := Map current into prior frame.
1771    con1[0] = StpU1_F1(jitC[0] / imgC[0] - jitP[0] / imgP[0]);
1772    con1[1] = StpU1_F1(jitC[1] / imgC[1] - jitP[1] / imgP[1]);
1773    // StpF2 kJitCRcpC := Take {0 to 1} position in current image, and map back to {0 to 1} position in feedback (removes jitter).
1774    con1[2] = StpU1_F1(jitC[0] / imgC[0]);
1775    con1[3] = StpU1_F1(jitC[1] / imgC[1]);
1776//------------------------------------------------------------------------------------------------------------------------------
1777    // StpF2 kF := size of feedback (aka output) in pixels.
1778    con2[0] = StpU1_F1(imgF[0]);
1779    con2[1] = StpU1_F1(imgF[1]);
1780    // StpF2 kDepth := Copied logic from StpZCon().
    // k0 := 1/near, k1 := 1/log2(far/near).
1781    StpF1 k0 = StpRcpF1(near);
1782    StpF1 k1 = StpRcpF1(StpLog2F1(k0 * far));
1783    con2[2] = StpU1_F1(k0);
1784    con2[3] = StpU1_F1(k1);
1785//------------------------------------------------------------------------------------------------------------------------------
1786    // StpF4 kOS := Scale and bias to check for out of bounds (and kill feedback).
1787    // Scaled and biased output needs to {-1 out of bounds, >-1 in bounds, <1 in bounds, 1 out of bounds}.
1788    StpVarF2 s;
1789    // Undo 'pM' scaling, and multiply by 2 (as this needs to be -1 to 1 at edge of acceptable reprojection).
1790    s[0] = StpF1_(2.0);
1791    s[1] = StpF1_(2.0);
1792    // Scaling to push outside safe reprojection over 1.
1793    s[0] *= imgP[0] / (imgP[0] + StpF1_(4.0));
1794    s[1] *= imgP[1] / (imgP[1] + StpF1_(4.0));
1795    con3[0] = StpU1_F1(s[0]);
1796    con3[1] = StpU1_F1(s[1]);
1797    // Factor out subtracting off the mid point scaled by the multiply term.
1798    con3[2] = StpU1_F1(StpF1_(-0.5) * s[0]);
1799    con3[3] = StpU1_F1(StpF1_(-0.5) * s[1]);
1800//------------------------------------------------------------------------------------------------------------------------------
1801    // StpF2 kUnDepth := Copied logic from StpZUnCon().
1802    con4[0] = StpU1_F1(StpLog2F1(far * StpRcpF1(near)));
1803    con4[1] = StpU1_F1(near);
1804    // kMotionMatch
1805    con4[2] = StpU1_F1(motionMatch);
1806    // Unused for now.
1807    con4[3] = StpU1_(0);
1808//------------------------------------------------------------------------------------------------------------------------------
1809    // StpF2 kC := Size of current input image in pixels.
1810    con5[0] = StpU1_F1(imgC[0]);
1811    con5[1] = StpU1_F1(imgC[1]);
1812    // kST
    // {s*g, t*g + h} of the backward transform combined with the previous-prior projection.
1813    con5[2] = StpU1_F1(bckQRST.z * prjPrvCDGH.z);
1814    con5[3] = StpU1_F1(bckQRST.w * prjPrvCDGH.z + prjPrvCDGH.w);
1815//------------------------------------------------------------------------------------------------------------------------------
1816    // See header docs in "STATIC GEOMETRY MOTION FORWARD PROJECTION".
1817    // k0123
    // k0..k5 use the prior projection only: {g/a, h/a, e/a, g/b, h/b, f/b}.
1818    con6[0] = StpU1_F1(prjPriCDGH.z / prjPriABEF.x);
1819    con6[1] = StpU1_F1(prjPriCDGH.w / prjPriABEF.x);
1820    con6[2] = StpU1_F1(prjPriABEF.z / prjPriABEF.x);
1821    con6[3] = StpU1_F1(prjPriCDGH.z / prjPriABEF.y);
1822    // k4567
1823    con7[0] = StpU1_F1(prjPriCDGH.w / prjPriABEF.y);
1824    con7[1] = StpU1_F1(prjPriABEF.w / prjPriABEF.y);
    // k6..kH combine the forward transform rows with the current projection.
1825    con7[2] = StpU1_F1(forIJKL.x * prjCurABEF.x + forQRST.x * prjCurABEF.z);
1826    con7[3] = StpU1_F1(forIJKL.y * prjCurABEF.x + forQRST.y * prjCurABEF.z);
1827    // k89AB
1828    con8[0] = StpU1_F1(forIJKL.z * prjCurABEF.x + forQRST.z * prjCurABEF.z);
1829    con8[1] = StpU1_F1(forIJKL.w * prjCurABEF.x + forQRST.w * prjCurABEF.z);
1830    con8[2] = StpU1_F1(forMNOP.x * prjCurABEF.y + forQRST.x * prjCurABEF.w);
1831    con8[3] = StpU1_F1(forMNOP.y * prjCurABEF.y + forQRST.y * prjCurABEF.w);
1832    // kCDEF
1833    con9[0] = StpU1_F1(forMNOP.z * prjCurABEF.y + forQRST.z * prjCurABEF.w);
1834    con9[1] = StpU1_F1(forMNOP.w * prjCurABEF.y + forQRST.w * prjCurABEF.w);
1835    con9[2] = StpU1_F1(forQRST.x * prjCurCDGH.z);
1836    con9[3] = StpU1_F1(forQRST.y * prjCurCDGH.z);
1837    // kGHIJ
1838    conA[0] = StpU1_F1(forQRST.z * prjCurCDGH.z);
1839    conA[1] = StpU1_F1(forQRST.w * prjCurCDGH.z + prjCurCDGH.w);
    // kI..kR repeat the same pattern with the backward transform and the previous-prior projection.
1840    conA[2] = StpU1_F1(bckIJKL.x * prjPrvABEF.x + bckQRST.x * prjPrvABEF.z);
1841    conA[3] = StpU1_F1(bckIJKL.y * prjPrvABEF.x + bckQRST.y * prjPrvABEF.z);
1842    // kKLMN
1843    conB[0] = StpU1_F1(bckIJKL.z * prjPrvABEF.x + bckQRST.z * prjPrvABEF.z);
1844    conB[1] = StpU1_F1(bckIJKL.w * prjPrvABEF.x + bckQRST.w * prjPrvABEF.z);
1845    conB[2] = StpU1_F1(bckMNOP.x * prjPrvABEF.y + bckQRST.x * prjPrvABEF.w);
1846    conB[3] = StpU1_F1(bckMNOP.y * prjPrvABEF.y + bckQRST.y * prjPrvABEF.w);
1847    // kOPQR
1848    conC[0] = StpU1_F1(bckMNOP.z * prjPrvABEF.y + bckQRST.z * prjPrvABEF.w);
1849    conC[1] = StpU1_F1(bckMNOP.w * prjPrvABEF.y + bckQRST.w * prjPrvABEF.w);
1850    conC[2] = StpU1_F1(bckQRST.x * prjPrvCDGH.z);
1851    conC[3] = StpU1_F1(bckQRST.y * prjPrvCDGH.z);}
1852//==============================================================================================================================
1853STP_STATIC void StpTaaCon(
    // Fills con0..con3 with resolution and jitter derived constants (names in the comments below).
    // Float values are stored through StpU1_F1().
1854// Generated constants.
1855StpInOutU4 con0,
1856StpInOutU4 con1,
1857StpInOutU4 con2,
1858StpInOutU4 con3,
1859// Amount of grain {0 = maximum, >0 is amount of stops less of grain}.
    // NOTE(review): 'grain' is not referenced anywhere in this function body.
1860StpF1 grain,
1861// Frame count for current frame (sets jitter).
1862StpU1 frame,
1863// Current image resolution in pixels.
1864StpInF2 imgC,
1865// Feedback (aka output) resolution in pixels.
1866StpInF2 imgF) {
1867//------------------------------------------------------------------------------------------------------------------------------
1868    // Grab jitter for current frame.
1869    StpVarF2 jitC;
1870    StpJit(jitC, frame);
1871//------------------------------------------------------------------------------------------------------------------------------
1872    // Conversion from integer pix position to center pix float pixel position in image for current input.
1873    //  xy := multiply term (M) --- Scale by 1/imgF to get to {0 to 1}.
1874    //  zw := addition term (A) --- Add 0.5*M to get to center of pixel, then subtract jitC to undo jitter.
    // NOTE(review): the value stored as M below is imgC/imgF (not 1/imgF alone).
1875    // StpF2 kCRcpF.
1876    con0[0] = StpU1_F1(imgC[0] / imgF[0]);
1877    con0[1] = StpU1_F1(imgC[1] / imgF[1]);
1878    // StpF2 kHalfCRcpFUnjitC.
1879    con0[2] = StpU1_F1(StpF1_(0.5) * imgC[0] / imgF[0] - jitC[0]);
1880    con0[3] = StpU1_F1(StpF1_(0.5) * imgC[1] / imgF[1] - jitC[1]);
1881//------------------------------------------------------------------------------------------------------------------------------
1882    // StpF2 kRcpC := 1/size of current input image in pixels.
1883    con1[0] = StpU1_F1(StpF1_(1.0) / imgC[0]);
1884    con1[1] = StpU1_F1(StpF1_(1.0) / imgC[1]);
1885//------------------------------------------------------------------------------------------------------------------------------
1886    // StpF2 kRcpF := 1/size of feedback image (aka output) in pixels.
1887    con1[2] = StpU1_F1(StpF1_(1.0) / imgF[0]);
1888    con1[3] = StpU1_F1(StpF1_(1.0) / imgF[1]);
1889//------------------------------------------------------------------------------------------------------------------------------
1890    // StpF2 kHalfRcpF := 0.5/size of feedback image (aka output) in pixels.
1891    con2[0] = StpU1_F1(StpF1_(0.5) / imgF[0]);
1892    con2[1] = StpU1_F1(StpF1_(0.5) / imgF[1]);
1893//------------------------------------------------------------------------------------------------------------------------------
1894    // Conversion from a {0 to 1} position in current input to feedback.
1895    // StpH3 kJitCRcpC0 := jitC / image size in pixels + {-0.5/size, +0.5/size} of current input image in pixels.
    // NOTE(review): only two components are written; x subtracts the half pixel while y adds it -- confirm against
    // the consumer of kJitCRcpC0.
1896    con2[2] = StpU1_F1(jitC[0] / imgC[0] - StpF1_(0.5) / imgC[0]);
1897    con2[3] = StpU1_F1(jitC[1] / imgC[1] + StpF1_(0.5) / imgC[1]);
1898//------------------------------------------------------------------------------------------------------------------------------
1899    // StpF2 kHalfRcpC := 0.5/size of current input image in pixels.
1900    con3[0] = StpU1_F1(StpF1_(0.5) / imgC[0]);
1901    con3[1] = StpU1_F1(StpF1_(0.5) / imgC[1]);
1902//------------------------------------------------------------------------------------------------------------------------------
1903    // StpF2 kF := size of feedback image in pixels.
1904    con3[2] = StpU1_F1(imgF[0]);
1905    con3[3] = StpU1_F1(imgF[1]); }
1906////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1907////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1908////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1909////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1910//_____________________________________________________________.._______________________________________________________________
1911//==============================================================================================================================
1912//
1913//                                                     PATTERN ENTRY POINT
1914//
1915//==============================================================================================================================
1916// See the packed 16-bit version for comments.
1917#if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_PAT)
1918    void StpPat4x4MaxF8(StpMU1 i, inout StpF4 a, inout StpF4 b); // Prototypes only in this section: no bodies appear here,
1919    void StpPat4x4SumF4(StpMU1 i, inout StpF4 a);                // so the including shader presumably defines them.
1920//------------------------------------------------------------------------------------------------------------------------------
        // StpPatPri*F() take a float position; StpPatF() passes its reprojected position 'pM'.
1921    StpMF1 StpPatPriConF(StpF2 p);
1922//------------------------------------------------------------------------------------------------------------------------------
        // StpPatDat*F() take an integer position; StpPatF() passes its pixel position 'pp' for motion, color and depth.
        // StpPatFixZF()/StpPatFixRF() take the values returned by StpPatDatZF()/StpPatDatRF().
1923    StpF2 StpPatDatMotF(StpMU2 o);
1924    StpMF3 StpPatDatColF(StpMU2 o);
1925    StpF1 StpPatDatZF(StpMU2 o);
1926    StpF1 StpPatFixZF(StpF1 z);
1927    StpU1 StpPatDatRF(StpMU2 o);
1928    StpMF1 StpPatFixRF(StpU1 v);
1929//------------------------------------------------------------------------------------------------------------------------------
        // Dither value for a pixel position.
1930    StpMF1 StpPatDitF(StpMU2 o);
1931//------------------------------------------------------------------------------------------------------------------------------
1932    StpMF4 StpPatPriFedF(StpF2 p);
1933    StpMF4 StpPatPriFedR4F(StpF2 p);
1934    StpMF4 StpPatPriFedG4F(StpF2 p);
1935    StpMF4 StpPatPriFedB4F(StpF2 p);
1936//------------------------------------------------------------------------------------------------------------------------------
1937    StpMF2 StpPatPriLumF(StpF2 p);
1938//------------------------------------------------------------------------------------------------------------------------------
        // The '...MinF' forms exist only when STP_MAX_MIN_UINT is non-zero, and the '...OF' forms (extra integer offset)
        // only when STP_OFFSETS is non-zero.
1939    StpU4 StpPatPriMot4F(StpF2 p);
1940    #if STP_MAX_MIN_UINT
1941        StpU1 StpPatPriMotMinF(StpF2 p);
1942    #endif // STP_MAX_MIN_UINT
1943    #if STP_OFFSETS
1944        StpU4 StpPatPriMot4OF(StpF2 p, StpI2 o);
1945        #if STP_MAX_MIN_UINT
1946            StpU1 StpPatPriMotMinOF(StpF2 p, StpI2 o);
1947        #endif // STP_MAX_MIN_UINT
1948    #endif // STP_OFFSETS
1949//------------------------------------------------------------------------------------------------------------------------------
        // Stores: StpPatF() calls these with its pixel position 'pp' and the motion, color, luma and convergence results.
1950    void StpPatStMotF(StpMU2 p, StpU1 v);
1951    void StpPatStColF(StpMU2 p, StpMF4 v);
1952    void StpPatStLumF(StpMU2 p, StpMF2 v);
1953    void StpPatStCnvF(StpMU2 p, StpMF1 v);
1954//==============================================================================================================================
1955    void StpPatF(
1956    StpMU1 lane,
1957    StpMU2 pp,
1958    StpU4 con0,
1959    StpU4 con1,
1960    StpU4 con2,
1961    StpU4 con3,
1962    StpU4 con4,
1963    StpU4 con5,
1964    StpU4 con6,
1965    StpU4 con7,
1966    StpU4 con8,
1967    StpU4 con9,
1968    StpU4 conA,
1969    StpU4 conB,
1970    StpU4 conC,
1971    StpU4 conD) {
1972//------------------------------------------------------------------------------------------------------------------------------
1973        StpMF4 rC;
1974        StpU1 rM;
1975        StpMF2 rL;
1976        StpMF1 rCnv;
1977//------------------------------------------------------------------------------------------------------------------------------
1978        StpF2 kRcpC = StpF2_U2(con0.xy);
1979        StpF2 kHalfRcpC = StpF2_U2(con0.zw);
1980        StpF2 kJitCRcpCUnjitPRcpP = StpF2_U2(con1.xy);
1981        StpF2 kJitCRcpC = StpF2_U2(con1.zw);
1982        StpF2 kF = StpF2_U2(con2.xy);
1983        StpF4 kOS = StpF4_U4(con3);
1984        StpF2 kDepth = StpF2_U2(con2.zw);
1985        StpF2 kUnDepth = StpF2_U2(con4.xy);
1986        StpF1 kMotionMatch = StpF1_U1(con4.z);
1987        StpF2 kC = StpF2_U2(con5.xy);
1988        StpF4 k0123 = StpF4_U4(con6);
1989        StpF4 k4567 = StpF4_U4(con7);
1990        StpF4 k89AB = StpF4_U4(con8);
1991        StpF4 kCDEF = StpF4_U4(con9);
1992        StpF4 kGHIJ = StpF4_U4(conA);
1993        StpF4 kKLMN = StpF4_U4(conB);
1994        StpF4 kOPQR = StpF4_U4(conC);
1995        StpF2 kST = StpF2_U2(conD.xy);
1996//------------------------------------------------------------------------------------------------------------------------------
1997        StpF2 m = StpPatDatMotF(pp);
1998        StpMF1 d = StpPatDitF(pp);
1999        StpF1 zPre = StpPatDatZF(pp);
2000        StpMF3 c = StpPatDatColF(pp);
2001//==============================================================================================================================
2002//      DEPENDENT INLINE INPUT MOTION
2003//==============================================================================================================================
2004        StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
2005//------------------------------------------------------------------------------------------------------------------------------
2006        // Check the streaming bandwidth limit.
2007        #if STP_BUG_BW_SOL
2008        {   StpMF2 lum2 = StpPatPriLumF(p);
2009            StpMF1 cnvPrev = StpPatPriConF(p);
2010            StpU4 mZVP4 = StpPatPriMot4F(p);
2011            StpU1 rPre = StpPatDatRF(p);
2012            StpMF3 f = StpPatPriFedF(p).rgb;
2013            StpF1 z = StpPatFixZF(zPre);
2014            StpMF1 r = StpPatFixRF(rPre);
2015            rC.rgb = StpMF3_(m.x) + StpMF3_(d.x) + c + StpMF3_(lum2.x) + StpMF3_(cnvPrev) + StpMF3(mZVP4.xyz) + f + StpMF3_(z+r);
2016            rC.a = StpMF1_(0.0);
2017            rL = rC.rg;
2018            rM = StpU1_(rC.r);
2019            rCnv = rC.r;
2020            StpPatStMotF(pp, rM);
2021            StpPatStLumF(pp, rL);
2022            StpPatStColF(pp, rC);
2023            StpPatStCnvF(pp, rCnv);
2024            return; }
2025        #endif // STP_BUG_BW_SOL
2026//------------------------------------------------------------------------------------------------------------------------------
2027        StpF2 pM = (p - m);
2028        StpF2 pF = pM + kJitCRcpC;
2029              pM = pM + kJitCRcpCUnjitPRcpP;
2030//------------------------------------------------------------------------------------------------------------------------------
2031        StpMF2 lum2 = StpPatPriLumF(pM);
2032//------------------------------------------------------------------------------------------------------------------------------
2033        StpMF1 cnvPrev = StpPatPriConF(pM);
2034//------------------------------------------------------------------------------------------------------------------------------
2035        #if (STP_SAFE_DILATE == 2)
2036            #if STP_MAX_MIN_UINT
2037                StpU4 mZVP4;
2038                #if STP_OFFSETS
2039                    mZVP4.x = StpPatPriMotMinOF(pM, StpI2(-1, -1));
2040                    mZVP4.y = StpPatPriMotMinOF(pM, StpI2( 1, -1));
2041                    mZVP4.z = StpPatPriMotMinOF(pM, StpI2(-1,  1));
2042                    mZVP4.w = StpPatPriMotMinOF(pM, StpI2( 1,  1));
2043                #else // STP_OFFSETS
2044                    mZVP4.x = StpPatPriMotMinF(pM + StpF2(-kRcpC.x, -kRcpC.y));
2045                    mZVP4.y = StpPatPriMotMinF(pM + StpF2( kRcpC.x, -kRcpC.y));
2046                    mZVP4.z = StpPatPriMotMinF(pM + StpF2(-kRcpC.x,  kRcpC.y));
2047                    mZVP4.w = StpPatPriMotMinF(pM + StpF2( kRcpC.x,  kRcpC.y));
2048                #endif // STP_OFFSETS
2049            #else // STP_MAX_MIN_UINT
2050                #if STP_OFFSETS
2051                    StpU4 mZVP4_0 = StpPatPriMot4OF(pM, StpI2(-1, -1));
2052                    StpU4 mZVP4_1 = StpPatPriMot4OF(pM, StpI2( 1, -1));
2053                    StpU4 mZVP4_2 = StpPatPriMot4OF(pM, StpI2(-1,  1));
2054                    StpU4 mZVP4_3 = StpPatPriMot4OF(pM, StpI2( 1,  1));
2055                #else // STP_OFFSETS
2056                    StpU4 mZVP4_0 = StpPatPriMot4F(pM + StpF2(-kRcpC.x, -kRcpC.y));
2057                    StpU4 mZVP4_1 = StpPatPriMot4F(pM + StpF2( kRcpC.x, -kRcpC.y));
2058                    StpU4 mZVP4_2 = StpPatPriMot4F(pM + StpF2(-kRcpC.x,  kRcpC.y));
2059                    StpU4 mZVP4_3 = StpPatPriMot4F(pM + StpF2( kRcpC.x,  kRcpC.y));
2060                #endif // STP_OFFSETS
2061            #endif // STP_MAX_MIN_UINT
2062        #else // (STP_SAFE_DILATE == 2)
2063            StpU1 mZVPN;
2064            StpU4 mZVP2a = StpPatPriMot4F(pM - kHalfRcpC);
2065            StpU4 mZVP2b = StpPatPriMot4F(pM + kHalfRcpC);
2066            #if STP_MAX_MIN_UINT
2067                mZVPN = StpPatPriMotMinF(pM);
2068            #else // STP_MAX_MIN_UINT
2069                StpU4 mZVP4 = StpPatPriMot4F(pM);
2070            #endif // STP_MAX_MIN_UINT
2071        #endif // (STP_SAFE_DILATE == 2)
2072//------------------------------------------------------------------------------------------------------------------------------
2073        StpU1 rPre = StpPatDatRF(pp);
2074//------------------------------------------------------------------------------------------------------------------------------
2075        StpMF4 f4R = StpPatPriFedR4F(pF);
2076        StpMF4 f4G = StpPatPriFedG4F(pF);
2077        StpMF4 f4B = StpPatPriFedB4F(pF);
2078        StpMF3 f = StpPatPriFedF(pF).rgb;
2079//==============================================================================================================================
2080//      DEPENDENT ON DITHER AND INLINE INPUT PARAMETERS
2081//==============================================================================================================================
2082        StpF1 dd = StpF1_(d);
2083        StpF1 z = StpPatFixZF(zPre);
2084        z = StpZPack(z, kDepth, dd);
2085        rM = StpMvPack(z, m, dd);
2086        StpPatStMotF(pp, rM);
2087//------------------------------------------------------------------------------------------------------------------------------
2088        #if STP_BUG
2089            // Pattern/Clipped Input Color
2090            { StpF4 bug = StpF4_(0.0);
2091                bug.rgb = sqrt(StpF3(c.rgb));
2092                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2093                StpBugF(StpU3(pp, 0), bug); }
2094//------------------------------------------------------------------------------------------------------------------------------
2095            // Pattern/Log Input Depth
2096            { StpF4 bug = StpF4_(0.0);
2097                bug.rgb = StpF3_(StpSatF1(z + StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2098                StpBugF(StpU3(pp, 1), bug); }
2099        #endif // STP_BUG
2100//------------------------------------------------------------------------------------------------------------------------------
2101        #if (STP_POSTMAP == 0)
2102            StpToneMF3(c);
2103        #endif // (STP_POSTMAP == 0)
2104//------------------------------------------------------------------------------------------------------------------------------
2105        #if STP_BUG
2106            // Pattern/Reversible Tonemapped Input Color
2107            { StpF4 bug = StpF4_(0.0);
2108                bug.rgb = sqrt(StpF3(c.rgb));
2109                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2110                StpBugF(StpU3(pp, 2), bug); }
2111        #endif // STP_BUG
2112//------------------------------------------------------------------------------------------------------------------------------
2113        c = sqrt(c);
2114        rC.rgb = StpSatMF3(c + StpMF3_(d * StpMF1(1.0 / 1023.0) + StpMF1(-0.5 / 1023.0)));
2115//------------------------------------------------------------------------------------------------------------------------------
2116        rL.x = dot(c, StpMF3(STP_LUMA));
2117        rL.y = lum2.x;
2118        StpPatStLumF(pp, rL);
2119//------------------------------------------------------------------------------------------------------------------------------
2120        #if STP_BUG
2121            // Pattern/Shaped Absolute Input Motion
2122            { StpF4 bug = StpF4_(0.0);
2123                bug.b = sqrt(StpF1_(rL.x) * StpF1_(0.25));
2124                bug.rg = StpF2_(1.0) - exp2(abs(StpF2(m)) * StpF2_(-32.0));
2125                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2126                StpBugF(StpU3(pp, 3), bug); }
2127        #endif // STP_BUG
2128//------------------------------------------------------------------------------------------------------------------------------
2129        StpMF1 moire = min(abs(rL.x - lum2.x), abs(lum2.x - lum2.y));
2130        moire *= StpMF1_(STP_PAT_DEMOIRE);
2131//------------------------------------------------------------------------------------------------------------------------------
2132        StpMF4 xnyRG = StpMF4(c.r, -c.r, c.g, -c.g);
2133        StpMF4 xnyBC = StpMF4(c.b, -c.b, -cnvPrev, -cnvPrev);
2134        #if defined(STP_16BIT)
2135        #else // defined(STP_16BIT)
2136            // We convert to full precision floats here since the reductions work on 32-bit values.
2137            StpF4 xnyRGF = StpF4(xnyRG);
2138            StpF4 xnyBCF = StpF4(xnyBC);
2139            StpPat4x4MaxF8(lane, xnyRGF, xnyBCF);
2140            xnyRG = StpMF4(xnyRGF);
2141            xnyBC = StpMF4(xnyBCF);
2142        #endif // defined(STP_16BIT)
2143        cnvPrev = -xnyBC.z;
2144        StpMF3 ne = max(StpMF3_(STP_PAT_NE_MIN) * StpMF3(xnyRG.x, xnyRG.z, xnyBC.x),
2145                       StpMF3(xnyRG.x + xnyRG.y, xnyRG.z + xnyRG.w, xnyBC.x + xnyBC.y));
2146        StpMF1 ne1 = dot(ne, StpMF3(STP_LUMA));
2147//------------------------------------------------------------------------------------------------------------------------------
2148        cnvPrev = StpSatMF1(cnvPrev + StpMF1_(1.0 / STP_FRAME_MAX));
2149//------------------------------------------------------------------------------------------------------------------------------
2150        StpF2 onXY = StpF2(pM.xy);
2151        onXY = onXY * kOS.xy + kOS.zw;
2152        StpF1 onS = StpSignedF1(max(abs(onXY.x), abs(onXY.y)) - StpF1_(1.0));
2153//------------------------------------------------------------------------------------------------------------------------------
2154        #if STP_BUG
2155            // Pattern/Motion Reprojection {R=Prior G=This Sqrt Luma Feedback Diff, B=Offscreen}
2156            { StpF4 bug = StpF4_(0.0);
2157                bug.g = StpF1_(abs(rL.x - lum2.x));
2158                bug.r = StpF1_(abs(lum2.x - lum2.y));
2159                bug.b = StpF1_(1.0) - StpF1_(onS);
2160                bug.rg = sqrt(bug.rg);
2161                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2162                StpBugF(StpU3(pp, 4), bug); }
2163        #endif // STP_BUG
2164//==============================================================================================================================
2165//      DEPENDENT ON PRIOR {Z, MOTION}
2166//==============================================================================================================================
2167        #if (STP_SAFE_DILATE == 2)
2168            #if (STP_MAX_MIN_UINT == 0)
2169                StpU4 mZVP4 = min(StpMin3U4(mZVP4_0, mZVP4_1, mZVP4_2), mZVP4_3);
2170            #endif // (STP_MAX_MIN_UINT == 0)
2171            StpU1 mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
2172        #else // (STP_SAFE_DILATE == 2)
2173            #if (STP_MAX_MIN_UINT == 0)
2174                mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
2175            #endif // (STP_MAX_MIN_UINT == 0)
2176            #if STP_SAFE_DILATE
2177                mZVPN = StpMin3U1(StpMin3U1(mZVPN, mZVP2a.x, mZVP2a.z), mZVP2b.x, mZVP2b.z);
2178            #endif // STP_SAFE_DILATE
2179        #endif // (STP_SAFE_DILATE == 2)
2180//------------------------------------------------------------------------------------------------------------------------------
2181        StpF2 mPN;
2182        StpF1 mZPN;
2183        StpMvUnpack(mZPN, mPN, mZVPN);
2184//------------------------------------------------------------------------------------------------------------------------------
2185        StpF2 mE;
2186        mE = sqrt(abs(m)) + StpF2_(1.0 / 256.0);
2187        mE = mE * mE - abs(m);
2188//------------------------------------------------------------------------------------------------------------------------------
2189        StpF1 sgZ = StpZUnpack(mZPN, kUnDepth);
2190        StpF2 bugF; StpF2 bugD;
2191        StpF2 sgM = StpFor(pM, sgZ, mPN, kMotionMatch, k0123, k4567, k89AB, kCDEF, kGHIJ, kKLMN, kOPQR, kST, bugF, bugD);
2192        sgM = StpSatF2(abs(sgM * StpF2_(0.5) - m) - mE) * kC;
2193        StpMF1 sgD = StpMF1(dot(sgM, sgM));
2194//------------------------------------------------------------------------------------------------------------------------------
2195        StpMF1 match = StpMF1_(1.0) - StpSatMF1(sgD * StpMF1_(STP_PAT_MOT_AMP) - StpMF1_(STP_PAT_MOT_ADD * STP_PAT_MOT_AMP));
2196        match *= StpMF1_(onS);
2197        rC.a = match;
2198        StpPatStColF(pp, rC);
2199//------------------------------------------------------------------------------------------------------------------------------
2200        moire = moire * match + StpMF1_(1.0 / 8192.0);
2201        moire = min(StpMF1_(1.0), ne1 * StpRcpMF1(moire));
2202//------------------------------------------------------------------------------------------------------------------------------
2203        StpMF1 tS = moire;
2204        StpMF1 r = StpPatFixRF(rPre);
2205        tS = tS * (StpMF1_(STP_PAT_RESPONSIVE) - r * StpMF1_(STP_PAT_RESPONSIVE)) + tS;
2206//------------------------------------------------------------------------------------------------------------------------------
2207        #if STP_BUG
2208            // Pattern/Sensitivity {G=No motion match, R=Responsive, B=Luma}
2209            { StpF4 bug = StpF4_(0.0);
2210                bug.g = StpF1_(1.0) - StpF1(match);
2211                bug.r = StpF1_(1.0) - StpF1(r);
2212                bug.b = StpF1_(rL.x);
2213                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2214                StpBugF(StpU3(pp, 5), bug); }
2215        #endif // STP_BUG
2216//==============================================================================================================================
2217//      DEPENDENT ON FEEDBACK
2218//==============================================================================================================================
2219        StpMF4 t;
2220        t.rgb = c - f;
2221        t.a = dot(abs(t.rgb), StpMF3(STP_LUMA));
2222        StpMF4 t4R = f4R - StpMF4_(c.r);
2223        StpMF4 t4G = f4G - StpMF4_(c.g);
2224        StpMF4 t4B = f4B - StpMF4_(c.b);
2225        StpMF4 t4A = abs(t4R) * StpMF4_(STP_LUMA_R) + abs(t4G) * StpMF4_(STP_LUMA_G) + abs(t4B) * StpMF4_(STP_LUMA_B);
2226        t.a = StpMin3MF1(t.a, t4A.x, StpMin3MF1(t4A.y, t4A.z, t4A.w));
2227        if(t.a == t4A.x) t.rgb = StpMF3(t4R.x, t4G.x, t4B.x);
2228        if(t.a == t4A.y) t.rgb = StpMF3(t4R.y, t4G.y, t4B.y);
2229        if(t.a == t4A.z) t.rgb = StpMF3(t4R.z, t4G.z, t4B.z);
2230        if(t.a == t4A.w) t.rgb = StpMF3(t4R.w, t4G.w, t4B.w);
2231//------------------------------------------------------------------------------------------------------------------------------
2232        t.rgb *= StpMF3_(tS);
2233//------------------------------------------------------------------------------------------------------------------------------
2234        #if defined(STP_16BIT)
2235            StpPat4x4SumH4(lane, t);
2236        #else // defined(STP_16BIT)
2237            // We convert to full precision floats here since the reductions work on 32-bit values, and MF might be 16-bit.
2238            StpF4 tF = StpF4(t);
2239            StpPat4x4SumF4(lane, tF);
2240            t = StpMF4(tF);
2241        #endif // defined(STP_16BIT)
2242        t.rgb *= StpMF3_(STP_PAT_SENSITIVITY);
2243//------------------------------------------------------------------------------------------------------------------------------
2244        StpMF3 bln3 = StpSatMF3(ne * StpRcpMF3(abs(t.rgb)));
2245        StpMF1 bln = StpMin3MF1(bln3.r, bln3.g, bln3.b);
2246//------------------------------------------------------------------------------------------------------------------------------
2247        StpMF1 cnv = StpSatMF1(bln * StpRcpMF1(StpMF1_(STP_FRAME_MAX) - StpMF1_(STP_FRAME_MAX) * bln));
2248//------------------------------------------------------------------------------------------------------------------------------
2249        cnv = StpSatMF1(cnv - StpMF1_(1.0 / STP_FRAME_MAX));
2250        rCnv = min(cnv, cnvPrev);
2251        StpPatStCnvF(pp, rCnv); }
2252#endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_PAT)
2253////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2254////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2255//_____________________________________________________________.._______________________________________________________________
2256//==============================================================================================================================
2257//                                                         16-BIT PATH
2258//==============================================================================================================================
2259// This is the packed 16-bit version; its comments also describe the mirrored 32-bit path above.
2260#if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_PAT)
2261    // Callback prototypes used by StpPatH() below. 4x4 wave op: 8 component maximum ('a' and 'b' are updated in place).
2262    void StpPat4x4MaxH8(StpW1 i, inout StpH4 a, inout StpH4 b); // Called with the 'lane' argument of StpPatH() as 'i'.
2263    // 4x4 wave op: 4 component sum ('a' is updated in place).
2264    void StpPat4x4SumH4(StpW1 i, inout StpH4 a);
2265//------------------------------------------------------------------------------------------------------------------------------
2266    // Sample bilinear interpolated clamp to edge prior convergence.
2267    StpH1 StpPatPriConH(StpF2 p); // StpPatH() samples this at the reprojected position 'pM'.
2268//------------------------------------------------------------------------------------------------------------------------------
2269    // Note this is still designed to be an inline function pass merged to avoid DRAM traffic.
2270    // So in an ideal world (with better merging with pre-scale post) these would be already in registers.
2271    // But when PAT pass is non-inline, these callbacks are placed in the right order for loads.
2272    // Input motion, 'position - motion' is the reprojected position, where {0 to 1} is range of the screen.
2273    StpF2 StpPatDatMotH(StpW2 o);
2274    // Input color, this is linear HDR or post-tonemap-linear depending on STP_POSTMAP.
2275    StpH3 StpPatDatColH(StpW2 o);
2276    StpF1 StpPatDatZH(StpW2 o); // Raw per-pixel depth fetch; StpPatH() passes the result through StpPatFixZH() before use.
2277    // Input depth, this is linear {0:near to INF:far} ranged.
2278    StpF1 StpPatFixZH(StpF1 z);
2279    StpU1 StpPatDatRH(StpW2 o); // Raw per-pixel responsive fetch; the result is converted by StpPatFixRH().
2280    // Responsive input pixel {0.0 := responsive, 1.0 := normal}.
2281    StpH1 StpPatFixRH(StpU1 v);
2282//------------------------------------------------------------------------------------------------------------------------------
2283    // Dither value {0 to 1} this should be input pixel frequency spatial temporal blue noise.
2284    StpH1 StpPatDitH(StpW2 o);
2285//------------------------------------------------------------------------------------------------------------------------------
2286    // Sample bilinear interpolated clamp to edge prior feedback.
2287    StpH4 StpPatPriFedH(StpF2 p); // StpPatH() only consumes '.rgb' of the result.
2288    // Gather4 versions, one call per color channel {R, G, B}.
2289    StpH4 StpPatPriFedR4H(StpF2 p);
2290    StpH4 StpPatPriFedG4H(StpF2 p);
2291    StpH4 StpPatPriFedB4H(StpF2 p);
2292//------------------------------------------------------------------------------------------------------------------------------
2293    // Sample bilinear interpolated clamp to edge 2-frame luma ring.
2294    StpH2 StpPatPriLumH(StpF2 p);
2295//------------------------------------------------------------------------------------------------------------------------------
2296    // Gather4 on prior {z,motion}.
2297    StpU4 StpPatPriMot4H(StpF2 p);
2298    #if STP_MAX_MIN_UINT
2299        StpU1 StpPatPriMotMinH(StpF2 p); // Single-value fetch StpPatH() uses in place of StpPatPriMot4H() (presumably a min-reduced sample).
2300    #endif // STP_MAX_MIN_UINT
2301    #if STP_OFFSETS
2302        StpU4 StpPatPriMot4OH(StpF2 p, StpI2 o); // Offset variant: StpPatH() passes 'o' = {-1 or 1} per axis instead of adding '+/-kRcpC' to 'p'.
2303        #if STP_MAX_MIN_UINT
2304            StpU1 StpPatPriMotMinOH(StpF2 p, StpI2 o); // Offset variant of StpPatPriMotMinH().
2305        #endif // STP_MAX_MIN_UINT
2306    #endif // STP_OFFSETS
2307//------------------------------------------------------------------------------------------------------------------------------
2308    void StpPatStMotH(StpW2 p, StpU1 v); // Output: receives the packed {z, motion} value produced by StpMvPack().
2309    void StpPatStColH(StpW2 p, StpH4 v); // Output: receives the intermediate color ('.rgb' is the dithered gamma 2.0 color).
2310    void StpPatStLumH(StpW2 p, StpH2 v); // Output: receives the luma ring {x = this frame, y = reprojected prior frame}.
2311    void StpPatStCnvH(StpW2 p, StpH1 v); // Output: receives the convergence value.
2312//==============================================================================================================================
2313    void StpPatH(
2314    StpW1 lane,
2315    StpW2 pp,
2316    StpU4 con0,
2317    StpU4 con1,
2318    StpU4 con2,
2319    StpU4 con3,
2320    StpU4 con4,
2321    StpU4 con5,
2322    StpU4 con6,
2323    StpU4 con7,
2324    StpU4 con8,
2325    StpU4 con9,
2326    StpU4 conA,
2327    StpU4 conB,
2328    StpU4 conC,
2329    StpU4 conD) {
2330//------------------------------------------------------------------------------------------------------------------------------
2331        // Outputs.
2332        StpH4 rC;
2333        StpU1 rM;
2334        StpH2 rL;
2335        StpH1 rCnv;
2336//------------------------------------------------------------------------------------------------------------------------------
2337        // Rename constants.
2338        StpF2 kRcpC = StpF2_U2(con0.xy);
2339        StpF2 kHalfRcpC = StpF2_U2(con0.zw);
2340        StpF2 kJitCRcpCUnjitPRcpP = StpF2_U2(con1.xy);
2341        StpF2 kJitCRcpC = StpF2_U2(con1.zw);
2342        StpF2 kF = StpF2_U2(con2.xy);
2343        StpF4 kOS = StpF4_U4(con3);
2344        StpF2 kDepth = StpF2_U2(con2.zw);
2345        StpF2 kUnDepth = StpF2_U2(con4.xy);
2346        StpF1 kMotionMatch = StpF1_U1(con4.z);
2347        StpF2 kC = StpF2_U2(con5.xy);
2348        StpF4 k0123 = StpF4_U4(con6);
2349        StpF4 k4567 = StpF4_U4(con7);
2350        StpF4 k89AB = StpF4_U4(con8);
2351        StpF4 kCDEF = StpF4_U4(con9);
2352        StpF4 kGHIJ = StpF4_U4(conA);
2353        StpF4 kKLMN = StpF4_U4(conB);
2354        StpF4 kOPQR = StpF4_U4(conC);
2355        StpF2 kST = StpF2_U2(conD.xy);
2356//------------------------------------------------------------------------------------------------------------------------------
2357        StpF2 m = StpPatDatMotH(pp);
2358        // This dither fetch should likely be shared with pass merged pre-scale post work in the future.
2359        StpH1 d = StpPatDitH(pp);
2360        StpF1 zPre = StpPatDatZH(pp);
2361        StpH3 c = StpPatDatColH(pp);
2362//==============================================================================================================================
2363//      DEPENDENT INLINE INPUT MOTION
2364//==============================================================================================================================
2365        // Work towards getting all dependent fetches out first.
2366        // Compute float position {0 to 1} across screen.
2367        StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
2368//------------------------------------------------------------------------------------------------------------------------------
2369        #if STP_BUG_BW_SOL
2370        {   StpH2 lum2 = StpPatPriLumH(p);
2371            StpH1 cnvPrev = StpPatPriConH(p);
2372            StpU4 mZVP4 = StpPatPriMot4H(p);
2373            StpU1 rPre = StpPatDatRH(p);
2374            StpH3 f = StpPatPriFedH(p).rgb;
2375            StpF1 z = StpPatFixZH(zPre);
2376            StpH1 r = StpPatFixRH(rPre);
2377            rC.rgb = StpH3_(m.x) + StpH3_(d.x) + c + StpH3_(lum2.x) + StpH3_(cnvPrev) + StpH3(mZVP4.xyz) + f + StpH3_(z+r);
2378            rC.a = StpH1_(0.0);
2379            rL = rC.rg;
2380            rM = StpU1_(rC.r);
2381            rCnv = rC.r;
2382            StpPatStMotH(pp, rM);
2383            StpPatStLumH(pp, rL);
2384            StpPatStColH(pp, rC);
2385            StpPatStCnvH(pp, rCnv);
2386            return; }
2387        #endif // STP_BUG_BW_SOL
2388//------------------------------------------------------------------------------------------------------------------------------
2389        // Reprojection position in prior input and feedback.
2390        StpF2 pM = (p - m);
2391        StpF2 pF = pM + kJitCRcpC;
2392              pM = pM + kJitCRcpCUnjitPRcpP;
2393//------------------------------------------------------------------------------------------------------------------------------
2394        // Fetch 2-frame reprojected history ring of luma.
2395        StpH2 lum2 = StpPatPriLumH(pM);
2396//------------------------------------------------------------------------------------------------------------------------------
2397        // Fetch reprojected low-frequency convergence prior frame.
2398        StpH1 cnvPrev = StpPatPriConH(pM);
2399//------------------------------------------------------------------------------------------------------------------------------
2400        // Grab large enough neighborhood for prior reprojected nearest {z,motion}.
2401        // This nearest dilates {z, motion} reprojection to avoid pulling in anti-aliased edges and leaving temporal ringing.
2402        #if (STP_SAFE_DILATE == 2)
2403            #if STP_MAX_MIN_UINT
2404                StpU4 mZVP4;
2405                #if STP_OFFSETS
2406                    mZVP4.x = StpPatPriMotMinOH(pM, StpI2(-1, -1));
2407                    mZVP4.y = StpPatPriMotMinOH(pM, StpI2( 1, -1));
2408                    mZVP4.z = StpPatPriMotMinOH(pM, StpI2(-1,  1));
2409                    mZVP4.w = StpPatPriMotMinOH(pM, StpI2( 1,  1));
2410                #else // STP_OFFSETS
2411                    mZVP4.x = StpPatPriMotMinH(pM + StpF2(-kRcpC.x, -kRcpC.y));
2412                    mZVP4.y = StpPatPriMotMinH(pM + StpF2( kRcpC.x, -kRcpC.y));
2413                    mZVP4.z = StpPatPriMotMinH(pM + StpF2(-kRcpC.x,  kRcpC.y));
2414                    mZVP4.w = StpPatPriMotMinH(pM + StpF2( kRcpC.x,  kRcpC.y));
2415                #endif // STP_OFFSETS
2416            #else // STP_MAX_MIN_UINT
2417                #if STP_OFFSETS
2418                    StpU4 mZVP4_0 = StpPatPriMot4OH(pM, StpI2(-1, -1));
2419                    StpU4 mZVP4_1 = StpPatPriMot4OH(pM, StpI2( 1, -1));
2420                    StpU4 mZVP4_2 = StpPatPriMot4OH(pM, StpI2(-1,  1));
2421                    StpU4 mZVP4_3 = StpPatPriMot4OH(pM, StpI2( 1,  1));
2422                #else // STP_OFFSETS
2423                    StpU4 mZVP4_0 = StpPatPriMot4H(pM + StpF2(-kRcpC.x, -kRcpC.y));
2424                    StpU4 mZVP4_1 = StpPatPriMot4H(pM + StpF2( kRcpC.x, -kRcpC.y));
2425                    StpU4 mZVP4_2 = StpPatPriMot4H(pM + StpF2(-kRcpC.x,  kRcpC.y));
2426                    StpU4 mZVP4_3 = StpPatPriMot4H(pM + StpF2( kRcpC.x,  kRcpC.y));
2427                #endif // STP_OFFSETS
2428            #endif // STP_MAX_MIN_UINT
2429        #else // (STP_SAFE_DILATE == 2)
2430            StpU1 mZVPN;
2431            // To be correct here this needs 'kHalfRcpP' (prior instead of current).
2432            // But didn't want to pass yet another pair of constants, so using current instead.
2433            // TODO: If later moving to 'kHalfRcpP' can use one sample by offset to save some VALU ops.
2434            // Also this is only used if STP_SAFE_DILATE=1 (else dead code).
2435            StpU4 mZVP2a = StpPatPriMot4H(pM - kHalfRcpC);
2436            StpU4 mZVP2b = StpPatPriMot4H(pM + kHalfRcpC);
2437            #if STP_MAX_MIN_UINT
2438                mZVPN = StpPatPriMotMinH(pM);
2439            #else // STP_MAX_MIN_UINT
2440                StpU4 mZVP4 = StpPatPriMot4H(pM);
2441            #endif // STP_MAX_MIN_UINT
2442        #endif // (STP_SAFE_DILATE == 2)
2443//------------------------------------------------------------------------------------------------------------------------------
2444        StpU1 rPre = StpPatDatRH(pp);
2445//------------------------------------------------------------------------------------------------------------------------------
2446        // Gather 4 on feedback.
2447        StpH4 f4R = StpPatPriFedR4H(pF);
2448        StpH4 f4G = StpPatPriFedG4H(pF);
2449        StpH4 f4B = StpPatPriFedB4H(pF);
2450        // Grab bilinear feedback.
2451        StpH3 f = StpPatPriFedH(pF).rgb;
2452//==============================================================================================================================
2453//      DEPENDENT ON DITHER AND INLINE INPUT PARAMETERS
2454//==============================================================================================================================
2455        StpF1 dd = StpF1_(d);
2456        // Convert depth {0 to inf} to {0 to 1} safe for 10-bit value.
2457        StpF1 z = StpPatFixZH(zPre);
2458        z = StpZPack(z, kDepth, dd);
2459        // Pack {MSB depth, LSB 11-bit XY motion}.
2460        rM = StpMvPack(z, m, dd);
2461        StpPatStMotH(pp, rM);
2462//------------------------------------------------------------------------------------------------------------------------------
2463        #if STP_BUG
2464            // Pattern/Clipped Input Color
2465            { StpF4 bug = StpF4_(0.0);
2466                bug.rgb = sqrt(StpF3(c));
2467                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2468                StpBugF(StpU3(pp, 0), bug); }
2469//------------------------------------------------------------------------------------------------------------------------------
2470            // Pattern/Log Input Depth
2471            { StpF4 bug = StpF4_(0.0);
2472                bug.rgb = StpF3_(StpSatF1(z + StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2473                StpBugF(StpU3(pp, 1), bug); }
2474        #endif // STP_BUG
2475//------------------------------------------------------------------------------------------------------------------------------
2476        // Pre-process color.
2477        // If running pre-tonemap, then do a fast reversible tonemapper (convert from {0 to inf} to {0 to 1}).
2478        #if (STP_POSTMAP == 0)
2479            StpToneH3(c);
2480        #endif // (STP_POSTMAP == 0)
2481//------------------------------------------------------------------------------------------------------------------------------
2482        #if STP_BUG
2483            // Pattern/Reversible Tonemapped Input Color
2484            { StpF4 bug = StpF4_(0.0);
2485                bug.rgb = sqrt(StpF3(c));
2486                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2487                StpBugF(StpU3(pp, 2), bug); }
2488        #endif // STP_BUG
2489//------------------------------------------------------------------------------------------------------------------------------
2490        // Output intermediate color.
2491        // Dither from linear to gamma 2.0.
2492        // Simple non-energy conserving dither is working, using 10-bit/channel.
2493        c = sqrt(c);
2494        rC.rgb = StpSatH3(c + StpH3_(d * StpH1(1.0 / 1023.0) + StpH1(-0.5 / 1023.0)));
2495//------------------------------------------------------------------------------------------------------------------------------
2496        // Setup the new 3-ring output luma.
2497        rL.x = dot(c, StpH3(STP_LUMA));
2498        rL.y = lum2.x;
2499        StpPatStLumH(pp, rL);
2500//------------------------------------------------------------------------------------------------------------------------------
2501        #if STP_BUG
2502            // Pattern/Shaped Absolute Input Motion
2503            { StpF4 bug = StpF4_(0.0);
2504                bug.b = sqrt(StpF1_(rL.x) * StpF1_(0.25));
2505                bug.rg = StpF2_(1.0) - exp2(abs(StpF2(m)) * StpF2_(-32.0));
2506                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2507                StpBugF(StpU3(pp, 3), bug); }
2508        #endif // STP_BUG
2509//------------------------------------------------------------------------------------------------------------------------------
2510        // Minimum change across the 3 frames {current, 2-frame reprojected history}.
2511        StpH1 moire = min(abs(rL.x - lum2.x), abs(lum2.x - lum2.y));
2512        moire *= StpH1_(STP_PAT_DEMOIRE);
2513//------------------------------------------------------------------------------------------------------------------------------
2514        // Grab neighborhood.
2515        // Parallel block {max,-min}, and -min of convergence.
2516        StpH4 xnyRG = StpH4(c.r, -c.r, c.g, -c.g);
2517        StpH4 xnyBC = StpH4(c.b, -c.b, -cnvPrev, -cnvPrev);
2518        #if defined(STP_16BIT)
2519            StpPat4x4MaxH8(lane, xnyRG, xnyBC);
2520        #else // defined(STP_16BIT)
2521            // We convert to full precision floats here since the reductions work on 32-bit values.
2522            StpF4 xnyRGF = StpF4_(xnyRG);
2523            StpF4 xnyBCF = StpF4_(xnyBC);
2524            StpPat4x4MaxF8(lane, xnyRGF, xnyBCF);
2525            xnyRG = StpMF4_(xnyRGF);
2526            xnyBC = StpMF4_(xnyBCF);
2527        #endif // defined(STP_16BIT)
2528        cnvPrev = -xnyBC.z;
2529        // This is max minus min (the '.y' is already negative).
2530        StpH3 ne = max(StpH3_(STP_PAT_NE_MIN) * StpH3(xnyRG.x, xnyRG.z, xnyBC.x),
2531                       StpH3(xnyRG.x + xnyRG.y, xnyRG.z + xnyRG.w, xnyBC.x + xnyBC.y));
2532        StpH1 ne1 = dot(ne, StpH3(STP_LUMA));
2533//------------------------------------------------------------------------------------------------------------------------------
2534        // Advance low frequency convergence.
2535        cnvPrev = StpSatH1(cnvPrev + StpH1_(1.0 / STP_FRAME_MAX));
2536//------------------------------------------------------------------------------------------------------------------------------
2537        // Estimate if reprojection is on-screen.
2538        StpF2 onXY = StpF2(pM.xy);
2539        // {-1 to 1} is on screen.
2540        onXY = onXY * kOS.xy + kOS.zw;
2541        // {0 := offscreen, 1 := onscreen}.
2542        StpF1 onS = StpSignedF1(max(abs(onXY.x), abs(onXY.y)) - StpF1_(1.0));
2543//------------------------------------------------------------------------------------------------------------------------------
2544        #if STP_BUG
2545            // Pattern/Motion Reprojection {R=Prior G=This Sqrt Luma Feedback Diff, B=Offscreen}
2546            { StpF4 bug = StpF4_(0.0);
2547                bug.g = StpF1_(abs(rL.x - lum2.x));
2548                bug.r = StpF1_(abs(lum2.x - lum2.y));
2549                bug.b = StpF1_(1.0) - StpF1_(onS);
2550                bug.rg = sqrt(bug.rg);
2551                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2552                StpBugF(StpU3(pp, 4), bug); }
2553        #endif // STP_BUG
2554//==============================================================================================================================
2555//      DEPENDENT ON PRIOR {Z, MOTION}
2556//==============================================================================================================================
2557        // Compute a motion match value.
2558        // Finish {z, motion} nearest dilation.
2559        #if (STP_SAFE_DILATE == 2)
2560            #if (STP_MAX_MIN_UINT == 0)
2561                StpU4 mZVP4 = min(StpMin3U4(mZVP4_0, mZVP4_1, mZVP4_2), mZVP4_3);
2562            #endif // (STP_MAX_MIN_UINT == 0)
2563            StpU1 mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
2564        #else // (STP_SAFE_DILATE == 2)
2565            #if (STP_MAX_MIN_UINT == 0)
2566                mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
2567            #endif // (STP_MAX_MIN_UINT == 0)
2568            #if STP_SAFE_DILATE
2569                mZVPN = StpMin3U1(StpMin3U1(mZVPN, mZVP2a.x, mZVP2a.z), mZVP2b.x, mZVP2b.z);
2570            #endif // STP_SAFE_DILATE
2571        #endif // (STP_SAFE_DILATE == 2)
2572//------------------------------------------------------------------------------------------------------------------------------
2573        // The {motion} matching logic.
2574        StpF2 mPN;
2575        StpF1 mZPN;
2576        // Motion 'm' units are {1 := move by one screen}.
2577        StpMvUnpack(mZPN, mPN, mZVPN);
2578//------------------------------------------------------------------------------------------------------------------------------
2579        StpF2 mE;
2580        // Use a smoother error estimate.
2581        // This '1/256' instead of '1/1024' is to be more accepting of a motion match.
2582        // The 'sqrt()' cannot be the low precision approximation without visually seeing differences in the mask.
2583        mE = sqrt(abs(m)) + StpF2_(1.0 / 256.0);
2584        mE = mE * mE - abs(m);
2585//------------------------------------------------------------------------------------------------------------------------------
2586        // Static geometry motion + estimated dynamic motion matching logic.
2587        // Take unpacked low precision {0 to 1} Z and decode to {0 to INF}.
2588        StpF1 sgZ = StpZUnpack(mZPN, kUnDepth);
2589        StpF2 bugF; StpF2 bugD;
2590        StpF2 sgM = StpFor(pM, sgZ, mPN, kMotionMatch, k0123, k4567, k89AB, kCDEF, kGHIJ, kKLMN, kOPQR, kST, bugF, bugD);
2591        // Note 'sgM' is in NDC {-1 to 1} space and 'm' is in {0 to 1} space, thus the 0.5 scaling factor.
2592        // The difference gets conservative possible motion encoding error subtracted out via 'saturate(abs(..)-mE)'.
2593        sgM = StpSatF2(abs(sgM * StpF2_(0.5) - m) - mE) * kC;
2594        StpH1 sgD = StpH1(dot(sgM, sgM));
2595//------------------------------------------------------------------------------------------------------------------------------
2596        // Motion match {0 := no match, 1 := match}.
2597        StpH1 match = StpH1_(1.0) - StpSatH1(sgD * StpH1_(STP_PAT_MOT_AMP) - StpH1_(STP_PAT_MOT_ADD * STP_PAT_MOT_AMP));
2598        // Offscreen is a non-match.
2599        match *= StpH1_(onS);
2600        // Pass motion match in alpha.
2601        rC.a = match;
2602        StpPatStColH(pp, rC);
2603//------------------------------------------------------------------------------------------------------------------------------
2604        // Must disable on non-motion match, but make sure it doesn't fully /0 later.
2605        moire = moire * match + StpH1_(1.0 / 8192.0);
2606        // Scale down temporal change proportional to ratio of local neighborhood and minimum 3-frame temporal change.
2607        moire = min(StpH1_(1.0), ne1 * StpRcpH1(moire));
2608//------------------------------------------------------------------------------------------------------------------------------
2609        // Sensitivity modifiers.
2610        // The following which gets optimized to two FMAs.
2611        //  tS = tS * ((1-v)*k  + 1) ... logic
2612        //  tS = tS * ((1-v)*k) + tS
2613        //  tS = tS * (k-v*k) + tS ..... optimized
2614        StpH1 tS = moire;
2615        StpH1 r = StpPatFixRH(rPre);
2616        tS = tS * (StpH1_(STP_PAT_RESPONSIVE) - r * StpH1_(STP_PAT_RESPONSIVE)) + tS;
2617//------------------------------------------------------------------------------------------------------------------------------
2618        #if STP_BUG
2619            // Pattern/Sensitivity {G=No motion match, R=Responsive, B=Luma}
2620            { StpF4 bug = StpF4_(0.0);
2621                bug.g = StpF1_(1.0) - StpF1(match);
2622                bug.r = StpF1_(1.0) - StpF1(r);
2623                bug.b = StpF1_(rL.x);
2624                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
2625                StpBugF(StpU3(pp, 5), bug); }
2626        #endif // STP_BUG
2627//==============================================================================================================================
2628//      DEPENDENT ON FEEDBACK
2629//==============================================================================================================================
2630        // Find lowest temporal difference.
2631        StpH4 t;
2632        t.rgb = c - f;
2633        // Luma diff in alpha.
2634        t.a = dot(abs(t.rgb), StpH3(STP_LUMA));
2635        // Compute lowest difference for all in quad.
2636        StpH4 t4R = f4R - StpH4_(c.r);
2637        StpH4 t4G = f4G - StpH4_(c.g);
2638        StpH4 t4B = f4B - StpH4_(c.b);
2639        StpH4 t4A = abs(t4R) * StpH4_(STP_LUMA_R) + abs(t4G) * StpH4_(STP_LUMA_G) + abs(t4B) * StpH4_(STP_LUMA_B);
2640        // Override with lower from gather4.
2641        t.a = StpMin3H1(t.a, t4A.x, StpMin3H1(t4A.y, t4A.z, t4A.w));
2642        if(t.a == t4A.x) t.rgb = StpH3(t4R.x, t4G.x, t4B.x);
2643        if(t.a == t4A.y) t.rgb = StpH3(t4R.y, t4G.y, t4B.y);
2644        if(t.a == t4A.z) t.rgb = StpH3(t4R.z, t4G.z, t4B.z);
2645        if(t.a == t4A.w) t.rgb = StpH3(t4R.w, t4G.w, t4B.w);
2646//------------------------------------------------------------------------------------------------------------------------------
2647        // Factor in sensitivity and reduce.
2648        t.rgb *= StpH3_(tS);
2649//------------------------------------------------------------------------------------------------------------------------------
2650        #if defined(STP_16BIT)
2651            StpPat4x4SumH4(lane, t);
2652        #else // defined(STP_16BIT)
2653            // We convert to full precision floats here since the reductions work on 32-bit values, and MF might be 16-bit.
2654            StpF4 tF = StpF4(t);
2655            StpPat4x4SumF4(lane, tF);
2656            t = StpMF4(tF);
2657        #endif // defined(STP_16BIT)
2658        t.rgb *= StpH3_(STP_PAT_SENSITIVITY);
2659//------------------------------------------------------------------------------------------------------------------------------
2660        // Ratio of 'spatial/temporal' change.
2661        StpH3 bln3 = StpSatH3(ne * StpPrxLoRcpH3(abs(t.rgb)));
2662        // Worst channel limits to avoid chroma ghosting.
2663        StpH1 bln = StpMin3H1(bln3.r, bln3.g, bln3.b);
2664//------------------------------------------------------------------------------------------------------------------------------
2665        // Convert from blend ratio to convergence.
2666        // Note, 'rcp(0)=+INF' when approximations are not used.
2667        StpH1 cnv = StpSatH1(bln * StpPrxLoRcpH1(StpH1_(STP_FRAME_MAX) - StpH1_(STP_FRAME_MAX) * bln));
2668//------------------------------------------------------------------------------------------------------------------------------
2669        // Feedback the min of reprojected convergence, and subtract one frame (as next frame advances by one).
2670        cnv = StpSatH1(cnv - StpH1_(1.0 / STP_FRAME_MAX));
2671        rCnv = min(cnv, cnvPrev);
2672        StpPatStCnvH(pp, rCnv); }
2673#endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_PAT)
2674////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2675////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2676////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2677////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2678//_____________________________________________________________.._______________________________________________________________
2679//==============================================================================================================================
2680//
2681//                                                PATTERN DILATION ENTRY POINT
2682//
2683//------------------------------------------------------------------------------------------------------------------------------
2684// This should be pass merged with STP_SAA.
2685// Dilates low frequency convergence.
2686//==============================================================================================================================
2687#if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_DIL)
2688    StpMF1 StpDilDitF(StpMU2 o);
2689    StpMF1 StpDilConF(StpF2 p);
2690    StpMF4 StpDilCon4F(StpF2 p);
2691    #if STP_OFFSETS
2692        StpMF1 StpDilConOF(StpF2 p, StpI2 o);
2693        StpMF4 StpDilCon4OF(StpF2 p, StpI2 o);
2694    #endif // STP_OFFSETS
2695//==============================================================================================================================
2696    void StpDilF(out StpMF1 oC, StpU2 pp, StpU4 con0) {
2697        StpF2 kRcpR = StpF2_U2(con0.xy);
2698//------------------------------------------------------------------------------------------------------------------------------
2699        StpF2 p = StpF2(pp) * kRcpR;
2700//------------------------------------------------------------------------------------------------------------------------------
2701        #if STP_BUG_BW_SOL
2702        { oC = StpDilCon4F(p).x; return; }
2703        #endif // STP_BUG_BW_SOL
2704//------------------------------------------------------------------------------------------------------------------------------
2705        #if STP_OFFSETS
2706            StpMF4 g0 = StpDilCon4OF(p, StpI2(-1.0, -1.0));
2707            StpMF4 g1 = StpDilCon4OF(p, StpI2( 1.0, -1.0));
2708            StpMF4 g2 = StpDilCon4OF(p, StpI2( 3.0, -1.0));
2709            StpMF4 g3 = StpDilCon4OF(p, StpI2(-1.0,  1.0));
2710            StpMF4 g4 = StpDilCon4OF(p, StpI2( 1.0,  1.0));
2711            StpMF4 g5 = StpDilCon4OF(p, StpI2( 3.0,  1.0));
2712            StpMF4 g6 = StpDilCon4OF(p, StpI2(-1.0,  3.0));
2713            StpMF4 g7 = StpDilCon4OF(p, StpI2( 1.0,  3.0));
2714            StpMF4 g8 = StpDilCon4OF(p, StpI2( 3.0,  3.0));
2715        #else // STP_OFFSETS
2716            StpMF4 g0 = StpDilCon4F(p + StpF2(-1.0 * kRcpR.x, -1.0 * kRcpR.y));
2717            StpMF4 g1 = StpDilCon4F(p + StpF2( 1.0 * kRcpR.x, -1.0 * kRcpR.y));
2718            StpMF4 g2 = StpDilCon4F(p + StpF2( 3.0 * kRcpR.x, -1.0 * kRcpR.y));
2719            StpMF4 g3 = StpDilCon4F(p + StpF2(-1.0 * kRcpR.x,  1.0 * kRcpR.y));
2720            StpMF4 g4 = StpDilCon4F(p + StpF2( 1.0 * kRcpR.x,  1.0 * kRcpR.y));
2721            StpMF4 g5 = StpDilCon4F(p + StpF2( 3.0 * kRcpR.x,  1.0 * kRcpR.y));
2722            StpMF4 g6 = StpDilCon4F(p + StpF2(-1.0 * kRcpR.x,  3.0 * kRcpR.y));
2723            StpMF4 g7 = StpDilCon4F(p + StpF2( 1.0 * kRcpR.x,  3.0 * kRcpR.y));
2724            StpMF4 g8 = StpDilCon4F(p + StpF2( 3.0 * kRcpR.x,  3.0 * kRcpR.y));
2725        #endif // STP_OFFSETS
2726//------------------------------------------------------------------------------------------------------------------------------
2727        StpMF1 cA = g0.w;
2728        StpMF1 cB = g0.z;
2729        StpMF1 cC = g1.w;
2730        StpMF1 cD = g1.z;
2731        StpMF1 cE = g2.w;
2732        StpMF1 cF = g0.x;
2733        StpMF1 cG = g0.y;
2734        StpMF1 cH = g1.x;
2735        StpMF1 cI = g1.y;
2736        StpMF1 cJ = g2.x;
2737        StpMF1 cK = g3.w;
2738        StpMF1 cL = g3.z;
2739        StpMF1 cM = g4.w;
2740        StpMF1 cN = g4.z;
2741        StpMF1 cO = g5.w;
2742        StpMF1 cP = g3.x;
2743        StpMF1 cQ = g3.y;
2744        StpMF1 cR = g4.x;
2745        StpMF1 cS = g4.y;
2746        StpMF1 cT = g5.x;
2747        StpMF1 cU = g6.w;
2748        StpMF1 cV = g6.z;
2749        StpMF1 cW = g7.w;
2750        StpMF1 cX = g7.z;
2751        StpMF1 cY = g8.w;
2752//------------------------------------------------------------------------------------------------------------------------------
2753        StpMF4 m1345;
2754        m1345.x = StpMin3MF1(StpMin3MF1(cG, cH, cI), cC, cM);
2755        m1345.y = StpMin3MF1(StpMin3MF1(cK, cL, cM), cG, cQ);
2756        m1345.z = StpMin3MF1(StpMin3MF1(cL, cM, cN), cH, cR);
2757        m1345.w = StpMin3MF1(StpMin3MF1(cM, cN, cO), cI, cS);
2758        StpMF1 m7 = StpMin3MF1(StpMin3MF1(cQ, cR, cS), cM, cW);
2759//------------------------------------------------------------------------------------------------------------------------------
2760        StpMF1 b0 = StpMF1_(0.5);
2761        StpMF1 b1 = (StpMF1_(1.0) - b0) * StpMF1_(0.25);
2762        oC = m1345.z * b0 + m1345.x * b1 + m1345.y * b1 + m1345.w * b1 + m7 * b1; }
2763#endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_DIL)
2764////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2765////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2766//_____________________________________________________________.._______________________________________________________________
2767//==============================================================================================================================
2768//                                                         16-BIT PATH
2769//==============================================================================================================================
2770#if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_DIL)
2771    // Some of these are unused, possibly for future experimentation.
2772    StpH1 StpDilDitH(StpW2 o);
2773    StpH1 StpDilConH(StpF2 p);
2774    StpH4 StpDilCon4H(StpF2 p);
2775    #if STP_OFFSETS
2776        StpH1 StpDilConOH(StpF2 p, StpI2 o);
2777        StpH4 StpDilCon4OH(StpF2 p, StpI2 o);
2778    #endif // STP_OFFSETS
2779//==============================================================================================================================
2780    void StpDilH(out StpH1 oC, StpU2 pp, StpU4 con0) {
2781        StpF2 kRcpR = StpF2_U2(con0.xy);
2782        StpF2 p = StpF2(pp) * kRcpR;
2783//------------------------------------------------------------------------------------------------------------------------------
2784        #if STP_BUG_BW_SOL
2785        { oC = StpDilCon4H(p).x; return; }
2786        #endif // STP_BUG_BW_SOL
2787//------------------------------------------------------------------------------------------------------------------------------
2788        // Gather.
2789        //  0   1   2
2790        //
2791        //  3   4   5
2792        //
2793        //  6   7   8
2794        // For.
2795        //  w z w z w z
2796        //  x y.x y x y
2797        //  w z[w]z w z
2798        //  x y x y x y
2799        //  w z w z w z
2800        //  x y x y x y
2801        #if STP_OFFSETS
2802            StpH4 g0 = StpDilCon4OH(p, StpI2(-1.0, -1.0));
2803            StpH4 g1 = StpDilCon4OH(p, StpI2( 1.0, -1.0));
2804            StpH4 g2 = StpDilCon4OH(p, StpI2( 3.0, -1.0));
2805            StpH4 g3 = StpDilCon4OH(p, StpI2(-1.0,  1.0));
2806            StpH4 g4 = StpDilCon4OH(p, StpI2( 1.0,  1.0));
2807            StpH4 g5 = StpDilCon4OH(p, StpI2( 3.0,  1.0));
2808            StpH4 g6 = StpDilCon4OH(p, StpI2(-1.0,  3.0));
2809            StpH4 g7 = StpDilCon4OH(p, StpI2( 1.0,  3.0));
2810            StpH4 g8 = StpDilCon4OH(p, StpI2( 3.0,  3.0));
2811        #else // STP_OFFSETS
2812            StpH4 g0 = StpDilCon4H(p + StpF2(-1.0 * kRcpR.x, -1.0 * kRcpR.y));
2813            StpH4 g1 = StpDilCon4H(p + StpF2( 1.0 * kRcpR.x, -1.0 * kRcpR.y));
2814            StpH4 g2 = StpDilCon4H(p + StpF2( 3.0 * kRcpR.x, -1.0 * kRcpR.y));
2815            StpH4 g3 = StpDilCon4H(p + StpF2(-1.0 * kRcpR.x,  1.0 * kRcpR.y));
2816            StpH4 g4 = StpDilCon4H(p + StpF2( 1.0 * kRcpR.x,  1.0 * kRcpR.y));
2817            StpH4 g5 = StpDilCon4H(p + StpF2( 3.0 * kRcpR.x,  1.0 * kRcpR.y));
2818            StpH4 g6 = StpDilCon4H(p + StpF2(-1.0 * kRcpR.x,  3.0 * kRcpR.y));
2819            StpH4 g7 = StpDilCon4H(p + StpF2( 1.0 * kRcpR.x,  3.0 * kRcpR.y));
2820            StpH4 g8 = StpDilCon4H(p + StpF2( 3.0 * kRcpR.x,  3.0 * kRcpR.y));
2821        #endif // STP_OFFSETS
2822//------------------------------------------------------------------------------------------------------------------------------
2823        // Rename
2824        //  a b c d e
2825        //  f g h i j
2826        //  k l m n o
2827        //  p q r s t
2828        //  u v w x y
2829        StpH1 cA = g0.w;
2830        StpH1 cB = g0.z;
2831        StpH1 cC = g1.w;
2832        StpH1 cD = g1.z;
2833        StpH1 cE = g2.w;
2834        StpH1 cF = g0.x;
2835        StpH1 cG = g0.y;
2836        StpH1 cH = g1.x;
2837        StpH1 cI = g1.y;
2838        StpH1 cJ = g2.x;
2839        StpH1 cK = g3.w;
2840        StpH1 cL = g3.z;
2841        StpH1 cM = g4.w;
2842        StpH1 cN = g4.z;
2843        StpH1 cO = g5.w;
2844        StpH1 cP = g3.x;
2845        StpH1 cQ = g3.y;
2846        StpH1 cR = g4.x;
2847        StpH1 cS = g4.y;
2848        StpH1 cT = g5.x;
2849        StpH1 cU = g6.w;
2850        StpH1 cV = g6.z;
2851        StpH1 cW = g7.w;
2852        StpH1 cX = g7.z;
2853        StpH1 cY = g8.w;
2854//------------------------------------------------------------------------------------------------------------------------------
2855        // 5 point min.
2856        //  . 1 .
2857        //  3 4 5
2858        //  . 7 .
2859        StpH4 m1345;
2860        m1345.x = StpMin3H1(StpMin3H1(cG, cH, cI), cC, cM);
2861        m1345.y = StpMin3H1(StpMin3H1(cK, cL, cM), cG, cQ);
2862        m1345.z = StpMin3H1(StpMin3H1(cL, cM, cN), cH, cR);
2863        m1345.w = StpMin3H1(StpMin3H1(cM, cN, cO), cI, cS);
2864        StpH1 m7 = StpMin3H1(StpMin3H1(cQ, cR, cS), cM, cW);
2865//------------------------------------------------------------------------------------------------------------------------------
2866        StpH1 b0 = StpH1_(0.5);
2867        StpH1 b1 = (StpH1_(1.0) - b0) * StpH1_(0.25);
2868        oC = m1345.z * b0 + m1345.x * b1 + m1345.y * b1 + m1345.w * b1 + m7 * b1; }
2869#endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_DIL)
2870////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2871////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2872////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2873////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2874//_____________________________________________________________.._______________________________________________________________
2875//==============================================================================================================================
2876//
2877//                                              SPATIAL ANTI-ALIASING ENTRY POINT
2878//
2879//------------------------------------------------------------------------------------------------------------------------------
2880// This should be pass merged with STP_DIL.
2881// It's a shell, GEAA is separated as a modified form could be useful on its own.
2882//==============================================================================================================================
2883#if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_SAA)
2884    StpMF4 StpSaaLum4F(StpF2 p);
2885    #if STP_OFFSETS
2886        StpMF4 StpSaaLum4OF(StpF2 p, StpI2 o);
2887    #endif
2888//------------------------------------------------------------------------------------------------------------------------------
2889    #define STP_GEAA 1
2890    StpMF4 StpGeaa4F(StpF2 p) { return StpSaaLum4F(p); }
2891    #if STP_OFFSETS
2892        StpMF4 StpGeaa4OF(StpF2 p, StpI2 o) { return StpSaaLum4OF(p, o); }
2893    #endif
2894    void StpGeaaF(out StpMF1 gW, out StpMF1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI);
2895//==============================================================================================================================
2896    void StpSaaF(out StpMF1 oN, StpU2 pp, StpU4 con0) {
2897//------------------------------------------------------------------------------------------------------------------------------
2898        StpF2 kRcpC = StpF2_U2(con0.xy);
2899        StpF2 kHalfRcpC = StpF2_U2(con0.zw);
2900//------------------------------------------------------------------------------------------------------------------------------
2901        StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
2902//------------------------------------------------------------------------------------------------------------------------------
2903        #if STP_BUG_BW_SOL
2904        { oN = StpSaaLum4F(p).x; return; }
2905        #endif // STP_BUG_BW_SOL
2906//------------------------------------------------------------------------------------------------------------------------------
2907        StpMF1 gLuma;
2908        StpMF1 gNe;
2909        StpF2 gFilter;
2910        StpF2 gDilate;
2911        StpGeaaF(oN, gLuma, gFilter, gDilate, p, kRcpC, kHalfRcpC); }
2912#endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_SAA)
2913////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2914////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2915//_____________________________________________________________.._______________________________________________________________
2916//==============================================================================================================================
2917//                                                         16-BIT PATH
2918//==============================================================================================================================
2919#if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_SAA)
2920    // Gather4 on current luma.
2921    StpH4 StpSaaLum4H(StpF2 p);
2922    #if STP_OFFSETS
2923        StpH4 StpSaaLum4OH(StpF2 p, StpI2 o);
2924    #endif
2925//------------------------------------------------------------------------------------------------------------------------------
2926    #define STP_GEAA 1
2927    StpH4 StpGeaa4H(StpF2 p) { return StpSaaLum4H(p); }
2928    #if STP_OFFSETS
2929        StpH4 StpGeaa4OH(StpF2 p, StpI2 o) { return StpSaaLum4OH(p, o); }
2930    #endif
2931    void StpGeaaH(out StpH1 gW, out StpH1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI);
2932//==============================================================================================================================
2933    void StpSaaH(
2934    out StpH1 oN, // Output control (to be stored).
2935    StpU2 pp,     // Input position {0 to size-1} across the input frame.
2936    StpU4 con0) { // Shared, first constant generated by StpPatCon().
2937//------------------------------------------------------------------------------------------------------------------------------
2938        StpF2 kRcpC = StpF2_U2(con0.xy);
2939        StpF2 kHalfRcpC = StpF2_U2(con0.zw);
2940//------------------------------------------------------------------------------------------------------------------------------
2941        // Float position {0 to 1} across screen.
2942        StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
2943//------------------------------------------------------------------------------------------------------------------------------
2944        #if STP_BUG_BW_SOL
2945        { oN = StpSaaLum4H(p).x; return; }
2946        #endif // STP_BUG_BW_SOL
2947//------------------------------------------------------------------------------------------------------------------------------
2948        StpH1 gLuma;   // Spatial AA (unused).
2949        StpH1 gNe;     // Output spatial neighborhood (unused).
2950        StpF2 gFilter; // Output position for anti-aliased color sampling if standalone (unused).
2951        StpF2 gDilate; // Output for {z,motion} dilation (unused).
2952        StpGeaaH(oN, gLuma, gFilter, gDilate, p, kRcpC, kHalfRcpC); }
2953#endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_SAA)
2954////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2955////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2956////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2957////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2958//_____________________________________________________________.._______________________________________________________________
2959//==============================================================================================================================
2960//
2961//                                                   SCALING TAA ENTRY POINT
2962//
2963//==============================================================================================================================
2964#if defined(STP_GPU) && defined(STP_TAA) && defined(STP_32BIT)
2965    StpMF4 StpTaaCtl4F(StpF2 p);
2966//------------------------------------------------------------------------------------------------------------------------------
2967    StpMF4 StpTaaCol4RF(StpF2 p);
2968    StpMF4 StpTaaCol4GF(StpF2 p);
2969    StpMF4 StpTaaCol4BF(StpF2 p);
2970    StpMF4 StpTaaCol4AF(StpF2 p);
2971//------------------------------------------------------------------------------------------------------------------------------
2972    StpMF1 StpTaaConF(StpF2 p);
2973//------------------------------------------------------------------------------------------------------------------------------
2974    StpMF1 StpTaaDitF(StpMU2 o);
2975//------------------------------------------------------------------------------------------------------------------------------
2976    StpU4 StpTaaMot4F(StpF2 p);
2977//------------------------------------------------------------------------------------------------------------------------------
2978    StpMF4 StpTaaPriFedF(StpF2 p);
2979    StpMF4 StpTaaPriFed4RF(StpF2 p);
2980    StpMF4 StpTaaPriFed4GF(StpF2 p);
2981    StpMF4 StpTaaPriFed4BF(StpF2 p);
2982    #if STP_MAX_MIN_10BIT
2983        StpMF4 StpTaaPriFedMaxF(StpF2 p);
2984        StpMF4 StpTaaPriFedMinF(StpF2 p);
2985    #endif // STP_MAX_MIN_10BIT
2986    #if STP_OFFSETS
2987        StpMF4 StpTaaPriFedOF(StpF2 p, StpI2 o);
2988        StpMF4 StpTaaPriFed4ROF(StpF2 p, StpI2 o);
2989        StpMF4 StpTaaPriFed4GOF(StpF2 p, StpI2 o);
2990        StpMF4 StpTaaPriFed4BOF(StpF2 p, StpI2 o);
2991    #endif // STP_OFFSETS
2992//==============================================================================================================================
2993    void StpTaaF(
2994    StpMU1 lane,   // Not referenced in this function; kept in the interface.
2995    StpMU2 o,      // Integer pixel offset in output.
2996    out StpMF4 rF, // Return Feedback (to be stored).
2997    out StpMF4 rW, // Return Output (to be stored).
2998    StpU4 con0,    // Constants generated by StpTaaCon().
2999    StpU4 con1,
3000    StpU4 con2,
3001    StpU4 con3) {
3002//------------------------------------------------------------------------------------------------------------------------------
            // Resolves one output pixel: gathers the 2x2 current-frame neighborhood, fetches feedback at the
            // motion-displaced position, and blends both into the feedback value (rF) and the output value (rW).
            // Dither value; not referenced again in this function.
3003        StpMF1 dit = StpTaaDitF(o);
3004//------------------------------------------------------------------------------------------------------------------------------
            // Rename constants.
3005        StpF2 kCRcpF = StpF2_U2(con0.xy);
3006        StpF2 kHalfCRcpFUnjitC = StpF2_U2(con0.zw);
3007        StpF2 kRcpC = StpF2_U2(con1.xy);
3008        StpF2 kRcpF = StpF2_U2(con1.zw);
3009        StpF2 kHalfRcpF = StpF2_U2(con2.xy);
3010        StpF2 kJitCRcpC0 = StpF2_U2(con2.zw);
3011        StpF2 kHalfRcpC = StpF2_U2(con3.xy);
3012        StpF2 kF = StpF2_U2(con3.zw);
3013//------------------------------------------------------------------------------------------------------------------------------
            // Check the streaming bandwidth limit.
            // Debug path: fetch the inputs once at the output coordinate and return their sum, skipping all other work.
            // Control is fetched with StpTaaCtl4F, the same callback the main path uses.
3014        #if STP_BUG_BW_SOL
3015        {   StpF2 oo = StpF2(o) * kRcpF;
3016            StpMF4 g4 = StpTaaCtl4F(oo);
3017            StpU4 m4 = StpTaaMot4F(oo);
3018            StpMF1 cnv = StpTaaConF(oo);
3019            StpMF4 f = StpTaaPriFedF(oo);
3020            StpMF4 c4R = StpTaaCol4RF(oo);
3021            rW = rF = g4 + StpMF4(m4) + StpMF4_(cnv) + f + c4R;
3022            return; }
3023        #endif // STP_BUG_BW_SOL
3024//------------------------------------------------------------------------------------------------------------------------------
            // Locate 2x2 neighborhood.
            // Float version of integer pixel offset in output.
            // All the 'o' prefixed variables are offset (aka position/coordinate) related.
3025        StpF2 oI = StpF2(o);
            // This gets to the center of the 2x2 quad directly because of possibility of shader/tex precision mismatch.
3026        StpF2 oC = oI * kCRcpF + kHalfCRcpFUnjitC;
            // NW of 2x2 quad.
3027        StpF2 oCNW = floor(oC + StpF2_(-0.5));
            // Center of the 2x2 quad.
3028        StpF2 oC4 = oCNW * kRcpC + kRcpC;
            // Coordinates for low frequency convergence.
3029        StpF2 oC1 = oC * kRcpC;
3030//==============================================================================================================================
3031//      FETCH {CONVERGENCE, COLOR, CONTROL, Z+MOTION}
3032//==============================================================================================================================
            // Fetch low-frequency convergence.
3033        StpMF1 cnv = StpTaaConF(oC1);
            // Fetch color, one gather per channel.
3034        StpMF4 c4R = StpTaaCol4RF(oC4);
3035        StpMF4 c4G = StpTaaCol4GF(oC4);
3036        StpMF4 c4B = StpTaaCol4BF(oC4);
3037        StpMF4 c4A = StpTaaCol4AF(oC4);
            // Control (GEAA weights).
3038        StpMF4 g4 = StpTaaCtl4F(oC4);
            // Fetch {z,motion}.
3039        StpU4 m4 = StpTaaMot4F(oC4);
3040//------------------------------------------------------------------------------------------------------------------------------
3041//      INDEPENDENT
3042//------------------------------------------------------------------------------------------------------------------------------
            // Setup resolve position {0 to 1} inside 2x2 quad.
            // The extra -0.5 is to get from NW position to center.
3043        StpMF2 rP = StpMF2(oC - oCNW) - StpMF2_(0.5);
3044//------------------------------------------------------------------------------------------------------------------------------
            // Distances to the ends of the 2x2, stored as {a,1-a} pairs so they can be reused by the angular filtering below.
3045        StpMF2 rPX10 = StpMF2(1.0, 0.0) + StpMF2(-rP.x, rP.x);
3046        StpMF2 rPY01 = StpMF2(0.0, 1.0) + StpMF2(rP.y, -rP.y);
            // Distance^2 {0 := on, 1 := off}.
3047        StpMF4 pen4x = StpMF4(rPX10.g, rPX10.r, rPX10.r, rPX10.g);
3048        StpMF4 pen4y = StpMF4(rPY01.g, rPY01.g, rPY01.r, rPY01.r);
            // Pen starts with distance squared to all 2x2 points.
3049        StpMF4 pen4 = StpSatMF4(pen4x * pen4x + pen4y * pen4y);
3050//==============================================================================================================================
3051//      DEPENDENT ON {CONVERGENCE}
3052//==============================================================================================================================
            // Low frequency convergence keeps the next frame value, so subtract one frame.
3053        cnv = StpSatMF1(cnv - StpMF1_(1.0 / STP_FRAME_MAX));
3054//------------------------------------------------------------------------------------------------------------------------------
            // Pen size based on convergence.
3055        StpMF1 pen = StpMF1_(cnv) * StpMF1_(STP_FRAME_MAX) + StpMF1_(1.0);
3056        pen = StpPrxLoSqrtMF1(pen);
3057        pen4 = StpSatMF4(StpMF4_(1.0) - pen4 * StpMF4_(pen));
            // NOTE(review): the STP_16BIT branch is empty in this function, so the statement below only exists
            // when STP_16BIT is not defined (the same holds for fRcp, match, t and c further down).
3058        #if defined(STP_16BIT)
3059        #else // defined(STP_16BIT)
3060            pen = StpSatMF1(pen4.x * pen4.x + pen4.y * pen4.y + pen4.z * pen4.z + pen4.w * pen4.w);
3061        #endif // defined(STP_16BIT)
3062//==============================================================================================================================
3063//      DEPENDENT ON {COLOR}
3064//==============================================================================================================================
            // Create a GEAA based weighting for no temporal feedback case.
3065        StpMF4 wG;
            // Exact luma not required.
3066        StpMF4 l4 = c4R + c4G * StpMF4_(2.0) + c4B;
            // Luma difference across each diagonal of the 2x2.
3067        StpMF2 difST = abs(l4.gr - l4.ab);
            // Choose configuration based on which difference is maximum.
3068        StpP1 useS = difST.x > difST.y;
            // Choose interpolation weights given the configuration.
3069        StpMF2 wTrb = StpSatMF2(StpMF2(-rP.x, rP.x) + StpMF2(rP.y, -rP.y));
3070        StpMF2 wSrb = min(rPX10, rPY01);
3071        if(useS) wTrb = wSrb;
3072        StpMF2 wTga = rPY01 - wTrb;
3073        wG.rg = StpMF2(wTrb.x, wTga.x);
3074        wG.ba = StpMF2(wTrb.y, wTga.y);
            // Shaping: raise the weights to the 4th power.
3075        wG *= wG;
3076        wG *= wG;
3077//------------------------------------------------------------------------------------------------------------------------------
            // Scale directional interpolation weights by GEAA weights to introduce anti-aliasing.
3078        wG *= g4;
            // Triangular nearest.
            // This works by removing the corner which contributes the least to the spatial interpolated result.
3079        StpMF4 triMask = StpMF4_(1.0);
3080        StpMF2 wGmin2 = min(wG.xy, wG.zw);
3081//==============================================================================================================================
3082//      DEPENDENT ON {Z,MOTION}
3083//==============================================================================================================================
            // This overwrites gather4 results: the removed corner gets 0xFFFFFFFF so it cannot win the min below.
3084        if(wGmin2.x < wGmin2.y) {
3085            if(wG.x < wG.z) { triMask.x = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.x = 0xFFFFFFFF; }
3086            else            { triMask.z = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.z = 0xFFFFFFFF; } }
3087        else {
3088            if(wG.y < wG.w) { triMask.y = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.y = 0xFFFFFFFF; }
3089            else            { triMask.w = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.w = 0xFFFFFFFF; } }
            // Smallest packed {z,motion} value among the remaining corners.
3090        StpU1 m1 = min(StpMin3U1(m4.x, m4.y, m4.z), m4.w);
3091//------------------------------------------------------------------------------------------------------------------------------
            // Scale the removed corner's spatial weight by STP_TAA_TRI_MASK_AVOID.
3092        wG *= triMask;
3093//------------------------------------------------------------------------------------------------------------------------------
            // Unpack the selected packed value into a motion offset (StpMvUnpackV is defined elsewhere).
3094        StpF2 mXY;
3095        StpMvUnpackV(mXY, m1);
3096//==============================================================================================================================
3097//      GET ALL FEEDBACK FILTERING DONE
3098//==============================================================================================================================
            // Feedback fetch position: output pixel scaled by kRcpF, plus kHalfRcpF, displaced by the motion.
3099        StpF2 oF = oI * kRcpF + kHalfRcpF - mXY;
3100//------------------------------------------------------------------------------------------------------------------------------
            // Filtered feedback color.
3101        StpMF3 f;
            // With STP_TAA_PRX_LANCZOS enabled, locate the 2x2 feedback quad around the displaced position
            // (same floor/NW/center pattern as oC/oCNW/oC4 above); otherwise take one StpTaaPriFedF fetch at oF.
3102        #if STP_TAA_PRX_LANCZOS
3103            StpF2 oM = oI + StpF2_(0.5) - mXY * kF;
3104            StpF2 oMNW = floor(oM + StpF2_(-0.5));
3105            StpF2 oM4 = oMNW * kRcpF + kRcpF;
3106            StpMF3 fMax, fMin;
3107        #else // STP_TAA_PRX_LANCZOS
3108            f = StpTaaPriFedF(oF).rgb;
3109        #endif // STP_TAA_PRX_LANCZOS
3110//==============================================================================================================================
            // Mode 1: four fetches stacked in y (x stays at oF.x), combined with a 4-tap weight set.
3111        #if (STP_TAA_PRX_LANCZOS == 1)
3112            #if STP_OFFSETS
                    // One base coordinate, remaining rows reached through integer offsets.
3113                StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
3114                StpMF3 f0 = StpTaaPriFedF(oM0).rgb;
3115                StpMF3 f1 = StpTaaPriFedOF(oM0, StpI2(0, 1)).rgb;
3116                StpMF3 f2 = StpTaaPriFedOF(oM0, StpI2(0, 2)).rgb;
3117                StpMF3 f3 = StpTaaPriFedOF(oM0, StpI2(0, 3)).rgb;
3118            #else // STP_OFFSETS
                    // Same four rows, each with an explicit coordinate.
3119                StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
3120                StpF2 oM1 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-0.5));
3121                StpF2 oM2 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 0.5));
3122                StpF2 oM3 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 1.5));
3123                StpMF3 f0 = StpTaaPriFedF(oM0).rgb;
3124                StpMF3 f1 = StpTaaPriFedF(oM1).rgb;
3125                StpMF3 f2 = StpTaaPriFedF(oM2).rgb;
3126                StpMF3 f3 = StpTaaPriFedF(oM3).rgb;
3127            #endif // STP_OFFSETS
                // Dering bounds come either from the dedicated max/min callbacks...
3128            #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
3129                fMax = StpTaaPriFedMaxF(oM4).rgb;
3130                fMin = StpTaaPriFedMinF(oM4).rgb;
3131            #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
                // ...or from per-channel gathers of the 2x2 quad, reduced to min/max further down.
3132            #if ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
3133                StpMF4 f4R = StpTaaPriFed4RF(oM4);
3134                StpMF4 f4G = StpTaaPriFed4GF(oM4);
3135                StpMF4 f4B = StpTaaPriFed4BF(oM4);
3136            #endif // ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
3137//------------------------------------------------------------------------------------------------------------------------------
3138//          INDEPENDENT
3139//------------------------------------------------------------------------------------------------------------------------------
                // Position inside the quad, then per-tap distances (pre-scaled by 0.5) for the four rows.
3140            StpMF2 fP = StpMF2(oM - oMNW);
3141            StpMF4 fPY = StpMF4_(-fP.y * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
                // Polynomial tap weights: a = sat(1 - d^2)^2, weight = ((1 + 81/175) * a^2 - 81/175) * a.
3142            fPY = StpSatMF4(StpMF4_(1.0) - fPY * fPY);
3143            fPY *= fPY;
3144            StpMF4 fPY4 = fPY * fPY;
3145            fPY = (StpMF4_(1.0 + 81.0 / 175.0) * fPY4 - StpMF4_(81.0 / 175.0)) * fPY;
                // Normalization factor from the sum of the four weights.
3146            #if defined(STP_16BIT)
3147            #else // defined(STP_16BIT)
3148                StpMF1 fRcp = StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a);
3149            #endif // defined(STP_16BIT)
3150//------------------------------------------------------------------------------------------------------------------------------
3151//          DEPENDENT
3152//------------------------------------------------------------------------------------------------------------------------------
                // Weighted sum of the four rows, then normalize.
3153            f.rgb = f0 * StpMF3_(fPY.r) + f1 * StpMF3_(fPY.g) + f2 * StpMF3_(fPY.b) + f3 * StpMF3_(fPY.a);
3154            f.rgb *= StpMF3_(fRcp);
                // Dering: clamp the filtered value to the min/max of the 2x2 quad.
3155            #if STP_TAA_PRX_LANCZOS_DERING
3156                #if (STP_MAX_MIN_10BIT == 0)
3157                    #if defined(STP_16BIT)
3158                    #else // defined(STP_16BIT)
3159                        fMax.r = max(StpMax3MF1(f4R.x, f4R.y, f4R.z), f4R.w);
3160                        fMax.g = max(StpMax3MF1(f4G.x, f4G.y, f4G.z), f4G.w);
3161                        fMax.b = max(StpMax3MF1(f4B.x, f4B.y, f4B.z), f4B.w);
3162                        fMin.r = min(StpMin3MF1(f4R.x, f4R.y, f4R.z), f4R.w);
3163                        fMin.g = min(StpMin3MF1(f4G.x, f4G.y, f4G.z), f4G.w);
3164                        fMin.b = min(StpMin3MF1(f4B.x, f4B.y, f4B.z), f4B.w);
3165                        f = clamp(f, fMin, fMax);
3166                    #endif // defined(STP_16BIT)
3167                #else // (STP_MAX_MIN_10BIT == 0)
3168                    f = clamp(f, fMin, fMax);
3169                #endif // (STP_MAX_MIN_10BIT == 0)
3170            #endif // STP_TAA_PRX_LANCZOS_DERING
3171        #endif // (STP_TAA_PRX_LANCZOS == 1)
3172//==============================================================================================================================
            // Mode 2: four per-channel gathers around oM4 (one per diagonal direction) give a 4x4 footprint,
            // filtered with separable x/y weights.
3173        #if (STP_TAA_PRX_LANCZOS == 2)
3174            #if STP_OFFSETS
3175                StpMF4 f4R0 = StpTaaPriFed4ROF(oM4, StpI2(-1, -1));
3176                StpMF4 f4G0 = StpTaaPriFed4GOF(oM4, StpI2(-1, -1));
3177                StpMF4 f4B0 = StpTaaPriFed4BOF(oM4, StpI2(-1, -1));
3178                StpMF4 f4R1 = StpTaaPriFed4ROF(oM4, StpI2( 1, -1));
3179                StpMF4 f4G1 = StpTaaPriFed4GOF(oM4, StpI2( 1, -1));
3180                StpMF4 f4B1 = StpTaaPriFed4BOF(oM4, StpI2( 1, -1));
3181                StpMF4 f4R2 = StpTaaPriFed4ROF(oM4, StpI2(-1,  1));
3182                StpMF4 f4G2 = StpTaaPriFed4GOF(oM4, StpI2(-1,  1));
3183                StpMF4 f4B2 = StpTaaPriFed4BOF(oM4, StpI2(-1,  1));
3184                StpMF4 f4R3 = StpTaaPriFed4ROF(oM4, StpI2( 1,  1));
3185                StpMF4 f4G3 = StpTaaPriFed4GOF(oM4, StpI2( 1,  1));
3186                StpMF4 f4B3 = StpTaaPriFed4BOF(oM4, StpI2( 1,  1));
3187            #else // STP_OFFSETS
                    // Same four gather positions, with the offsets applied to the coordinate in kRcpF steps.
3188                StpF2 oM0 = oM4 + StpF2(-kRcpF.x, -kRcpF.y);
3189                StpF2 oM1 = oM4 + StpF2( kRcpF.x, -kRcpF.y);
3190                StpF2 oM2 = oM4 + StpF2(-kRcpF.x,  kRcpF.y);
3191                StpF2 oM3 = oM4 + StpF2( kRcpF.x,  kRcpF.y);
3192                StpMF4 f4R0 = StpTaaPriFed4RF(oM0);
3193                StpMF4 f4G0 = StpTaaPriFed4GF(oM0);
3194                StpMF4 f4B0 = StpTaaPriFed4BF(oM0);
3195                StpMF4 f4R1 = StpTaaPriFed4RF(oM1);
3196                StpMF4 f4G1 = StpTaaPriFed4GF(oM1);
3197                StpMF4 f4B1 = StpTaaPriFed4BF(oM1);
3198                StpMF4 f4R2 = StpTaaPriFed4RF(oM2);
3199                StpMF4 f4G2 = StpTaaPriFed4GF(oM2);
3200                StpMF4 f4B2 = StpTaaPriFed4BF(oM2);
3201                StpMF4 f4R3 = StpTaaPriFed4RF(oM3);
3202                StpMF4 f4G3 = StpTaaPriFed4GF(oM3);
3203                StpMF4 f4B3 = StpTaaPriFed4BF(oM3);
3204            #endif // STP_OFFSETS
3205            #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
3206                fMax = StpTaaPriFedMaxF(oM4).rgb;
3207                fMin = StpTaaPriFedMinF(oM4).rgb;
3208            #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
3209//------------------------------------------------------------------------------------------------------------------------------
3210//          INDEPENDENT
3211//------------------------------------------------------------------------------------------------------------------------------
                // Per-tap distances and polynomial weights, built the same way as in mode 1 but for both axes.
3212            StpMF2 fP = StpMF2(oM - oMNW);
3213            StpMF4 fPX = StpMF4_(-fP.x * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
3214            StpMF4 fPY = StpMF4_(-fP.y * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
3215            fPX = StpSatMF4(StpMF4_(1.0) - fPX * fPX);
3216            fPY = StpSatMF4(StpMF4_(1.0) - fPY * fPY);
3217            fPX *= fPX;
3218            fPY *= fPY;
3219            StpMF4 fPX4 = fPX * fPX;
3220            StpMF4 fPY4 = fPY * fPY;
3221            fPX = (StpMF4_(1.0 + 81.0 / 175.0) * fPX4 - StpMF4_(81.0 / 175.0)) * fPX;
3222            fPY = (StpMF4_(1.0 + 81.0 / 175.0) * fPY4 - StpMF4_(81.0 / 175.0)) * fPY;
                // Normalize each axis by the sum of its four weights.
3223            #if defined(STP_16BIT)
3224            #else // defined(STP_16BIT)
3225                fPX *= StpMF4_(StpPrxLoRcpMF1(fPX.r + fPX.g + fPX.b + fPX.a));
3226                fPY *= StpMF4_(StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a));
3227            #endif // defined(STP_16BIT)
                // One row of combined x*y weights per y tap.
3228            StpMF4 fPX0 = fPX * StpMF4_(fPY.r);
3229            StpMF4 fPX1 = fPX * StpMF4_(fPY.g);
3230            StpMF4 fPX2 = fPX * StpMF4_(fPY.b);
3231            StpMF4 fPX3 = fPX * StpMF4_(fPY.a);
3232//------------------------------------------------------------------------------------------------------------------------------
3233//          DEPENDENT
3234//------------------------------------------------------------------------------------------------------------------------------
                // 16-tap weighted sum per channel. Rows weighted by fPY.r and fPY.b take the .w/.z gather
                // components; rows weighted by fPY.g and fPY.a take the .x/.y components.
3235            #if defined(STP_16BIT)
3236            #else // defined(STP_16BIT)
3237                f.r = f4R0.w * fPX0.r + f4R0.z * fPX0.g + f4R1.w * fPX0.b + f4R1.z * fPX0.a +
3238                      f4R0.x * fPX1.r + f4R0.y * fPX1.g + f4R1.x * fPX1.b + f4R1.y * fPX1.a +
3239                      f4R2.w * fPX2.r + f4R2.z * fPX2.g + f4R3.w * fPX2.b + f4R3.z * fPX2.a +
3240                      f4R2.x * fPX3.r + f4R2.y * fPX3.g + f4R3.x * fPX3.b + f4R3.y * fPX3.a;
3241                f.g = f4G0.w * fPX0.r + f4G0.z * fPX0.g + f4G1.w * fPX0.b + f4G1.z * fPX0.a +
3242                      f4G0.x * fPX1.r + f4G0.y * fPX1.g + f4G1.x * fPX1.b + f4G1.y * fPX1.a +
3243                      f4G2.w * fPX2.r + f4G2.z * fPX2.g + f4G3.w * fPX2.b + f4G3.z * fPX2.a +
3244                      f4G2.x * fPX3.r + f4G2.y * fPX3.g + f4G3.x * fPX3.b + f4G3.y * fPX3.a;
3245                f.b = f4B0.w * fPX0.r + f4B0.z * fPX0.g + f4B1.w * fPX0.b + f4B1.z * fPX0.a +
3246                      f4B0.x * fPX1.r + f4B0.y * fPX1.g + f4B1.x * fPX1.b + f4B1.y * fPX1.a +
3247                      f4B2.w * fPX2.r + f4B2.z * fPX2.g + f4B3.w * fPX2.b + f4B3.z * fPX2.a +
3248                      f4B2.x * fPX3.r + f4B2.y * fPX3.g + f4B3.x * fPX3.b + f4B3.y * fPX3.a;
3249            #endif // defined(STP_16BIT)
                // Dering: the four taps used for min/max are the ones paired with the middle weights
                // (fPX1/fPX2 rows, .g/.b columns) in the sum above, i.e. the inner 2x2 of the footprint.
3250            #if STP_TAA_PRX_LANCZOS_DERING
3251                #if (STP_MAX_MIN_10BIT == 0)
3252                    #if defined(STP_16BIT)
3253                    #else // defined(STP_16BIT)
3254                        fMax.r = max(StpMax3MF1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
3255                        fMax.g = max(StpMax3MF1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
3256                        fMax.b = max(StpMax3MF1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
3257                        fMin.r = min(StpMin3MF1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
3258                        fMin.g = min(StpMin3MF1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
3259                        fMin.b = min(StpMin3MF1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
3260                        f = clamp(f, fMin, fMax);
3261                    #endif // defined(STP_16BIT)
3262                #else // (STP_MAX_MIN_10BIT == 0)
3263                    f = clamp(f, fMin, fMax);
3264                #endif // (STP_MAX_MIN_10BIT == 0)
3265            #endif // STP_TAA_PRX_LANCZOS_DERING
3266        #endif // (STP_TAA_PRX_LANCZOS == 2)
3267//==============================================================================================================================
3268//      DISPLACEMENT
3269//==============================================================================================================================
            // Four feedback fetches: the current-frame quad center plus kJitCRcpC0, displaced by the motion,
            // then stepped by one kRcpC in +x and/or -y. d0..d3 pair with components x..w of the weights below.
3270        StpF2 oD0 = oC4 + kJitCRcpC0 - mXY;
3271        StpF2 oD1 = StpF2(kRcpC.x,      0.0) + oD0;
3272        StpF2 oD2 = StpF2(kRcpC.x, -kRcpC.y) + oD0;
3273        StpF2 oD3 = StpF2(0.0,     -kRcpC.y) + oD0;
3274        StpMF3 d0 = StpTaaPriFedF(oD0).rgb;
3275        StpMF3 d1 = StpTaaPriFedF(oD1).rgb;
3276        StpMF3 d2 = StpTaaPriFedF(oD2).rgb;
3277        StpMF3 d3 = StpTaaPriFedF(oD3).rgb;
3278//------------------------------------------------------------------------------------------------------------------------------
3279//      INDEPENDENT
3280//------------------------------------------------------------------------------------------------------------------------------
            // Normalize the spatial (GEAA) weights by their sum.
3281        #if defined(STP_16BIT)
3282        #else // defined(STP_16BIT)
3283            wG = StpSatMF4(wG * StpMF4_(StpPrxLoRcpMF1(wG.x + wG.y + wG.z + wG.w)));
3284        #endif // defined(STP_16BIT)
3285//------------------------------------------------------------------------------------------------------------------------------
            // Temporal weights: per-sample absolute difference between current color and filtered feedback,
            // weighted by STP_LUMA_{R,G,B}...
3286        StpMF4 wT = abs(c4R - StpMF4_(f.r)) * StpMF4_(STP_LUMA_R) +
3287                    abs(c4G - StpMF4_(f.g)) * StpMF4_(STP_LUMA_G) +
3288                    abs(c4B - StpMF4_(f.b)) * StpMF4_(STP_LUMA_B);
            // ...turned into a weight via reciprocal of (difference * STP_ANTI_MAX + STP_ANTI_MIN), with the removed corner masked.
3289        wT = StpPrxLoRcpMF4(wT * StpMF4_(STP_ANTI_MAX) + StpMF4_(STP_ANTI_MIN)) * triMask;
3290//------------------------------------------------------------------------------------------------------------------------------
            // Normalize the temporal weights by their sum.
3291        #if defined(STP_16BIT)
3292        #else // defined(STP_16BIT)
3293            wT = StpSatMF4(wT * StpMF4_(StpPrxLoRcpMF1(wT.x + wT.y + wT.z + wT.w)));
3294        #endif // defined(STP_16BIT)
3295//------------------------------------------------------------------------------------------------------------------------------
            // Average of both weight sets, applied to the gathered alpha channel to get 'match'.
3296        StpMF4 wM = wT * StpMF4_(0.5) + wG * StpMF4_(0.5);
3297        #if defined(STP_16BIT)
3298        #else // defined(STP_16BIT)
3299            StpMF1 match = c4A.x * wM.x + c4A.y * wM.y + c4A.z * wM.z + c4A.w * wM.w;
3300        #endif // defined(STP_16BIT)
            // Convergence is scaled down by 'match'.
3301        cnv *= match;
3302//------------------------------------------------------------------------------------------------------------------------------
3303//      DEPENDENT
3304//------------------------------------------------------------------------------------------------------------------------------
            // Displaced feedback under the spatial (dG) and temporal (dT) weights.
3305        StpMF3 dG = d0 * StpMF3_(wG.x) + d1 * StpMF3_(wG.y) + d2 * StpMF3_(wG.z) + d3 * StpMF3_(wG.w);
3306        StpMF3 dT = d0 * StpMF3_(wT.x) + d1 * StpMF3_(wT.y) + d2 * StpMF3_(wT.z) + d3 * StpMF3_(wT.w);
3307//------------------------------------------------------------------------------------------------------------------------------
            // Current-frame color under the temporal (t) and spatial (c) weights.
3308        #if defined(STP_16BIT)
3309        #else // defined(STP_16BIT)
3310            StpMF3 t = StpMF3(
3311                c4R.x * wT.x + c4R.y * wT.y + c4R.z * wT.z + c4R.w * wT.w,
3312                c4G.x * wT.x + c4G.y * wT.y + c4G.z * wT.z + c4G.w * wT.w,
3313                c4B.x * wT.x + c4B.y * wT.y + c4B.z * wT.z + c4B.w * wT.w);
3314            StpMF3 c = StpMF3(
3315                c4R.x * wG.x + c4R.y * wG.y + c4R.z * wG.z + c4R.w * wG.w,
3316                c4G.x * wG.x + c4G.y * wG.y + c4G.z * wG.z + c4G.w * wG.w,
3317                c4B.x * wG.x + c4B.y * wG.y + c4B.z * wG.z + c4B.w * wG.w);
3318        #endif // defined(STP_16BIT)
3319//------------------------------------------------------------------------------------------------------------------------------
            // Blend ratio toward feedback: cnv * rcp(cnv + 1/STP_FRAME_MAX), saturated.
3320        StpMF1 bln = StpSatMF1(cnv * StpPrxLoRcpMF1(cnv + StpMF1_(1.0 / STP_FRAME_MAX)));
3321        StpMF1 blnT = StpMF1_(1.0) - bln;
            // Clamp bounds for the results: component-wise min/max of 'c' and the feedback/'t' blend.
3322        StpMF3 b = f * StpMF3_(bln) + t * StpMF3_(blnT);
3323        StpMF3 minNe = min(c, b);
3324        StpMF3 maxNe = max(c, b);
3325//------------------------------------------------------------------------------------------------------------------------------
            // Pen color: 'c' plus the feedback change (f - dG) scaled by 0.9875 * match.
3326        StpMF3 penC = StpSatMF3(c + (f - dG) * StpMF3_(StpMF1_(0.9875) * match));
            // Pen amounts: .x applies to the output (rW), .y to the feedback (rF).
3327        StpMF2 penWF;
3328        penWF.x = pen * StpMF1_(STP_TAA_PEN_W);
3329        penWF.y = pen * lerp(StpMF1_(STP_TAA_PEN_F0), StpMF1_(STP_TAA_PEN_F1), cnv);
3330        StpMF2 penNotWF = StpMF2_(1.0) - penWF;
            // Base result: 't' plus the feedback change (f - dT), then blended toward 'f' by 'bln'.
3331        rF.rgb = t + (f - dT);
3332        rF.rgb = rF.rgb * StpMF3_(blnT) + f * StpMF3_(bln);
            // rW is derived from the un-penned rF first; rF is then overwritten with its own pen mix.
3333        rW.rgb = StpSatMF3(rF.rgb * StpMF3_(penNotWF.x) + penC * StpMF3_(penWF.x));
3334        rF.rgb = StpSatMF3(rF.rgb * StpMF3_(penNotWF.y) + penC * StpMF3_(penWF.y));
3335        rW.rgb = clamp(rW.rgb, minNe, maxNe);
3336        rF.rgb = clamp(rF.rgb, minNe, maxNe);
3337//------------------------------------------------------------------------------------------------------------------------------
            // Output only: square the value, and apply StpToneInvMF3 when STP_POSTMAP is 0.
3338        rW.rgb *= rW.rgb;
3339        #if (STP_POSTMAP == 0)
3340            StpToneInvMF3(rW.rgb);
3341        #endif // (STP_POSTMAP == 0)
            // Alpha of both results is written as zero.
3342        rF.a = rW.a = StpMF1(0.0); }
3343#endif // defined(STP_GPU) && defined(STP_TAA) && defined(STP_32BIT)
3344////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3345////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3346//_____________________________________________________________.._______________________________________________________________
3347//==============================================================================================================================
3348//                                                         16-BIT PATH
3349//==============================================================================================================================
3350#if defined(STP_GPU) && defined(STP_TAA) && defined(STP_16BIT)
3351    // Callbacks.
        // Only prototypes are declared here. NOTE(review): presumably implemented by the including shader - confirm.
3352    // Gather4 of GEAA control data.
        // NOTE(review): StpTaaH's STP_BUG_BW_SOL path calls StpTaaCtl4RH, which is not declared in this callback list.
3353    StpH4 StpTaaCtl4H(StpF2 p);
3354//------------------------------------------------------------------------------------------------------------------------------
3355    // Current frame {color,anti} input.
3356    // Gather4 specific channels.
3357    StpH4 StpTaaCol4RH(StpF2 p);
3358    StpH4 StpTaaCol4GH(StpF2 p);
3359    StpH4 StpTaaCol4BH(StpF2 p);
3360    StpH4 StpTaaCol4AH(StpF2 p);
3361//------------------------------------------------------------------------------------------------------------------------------
3362    // Bilinear sampling of low-frequency convergence.
3363    StpH1 StpTaaConH(StpF2 p);
3364//------------------------------------------------------------------------------------------------------------------------------
3365    // Dither value {0 to 1} this should be output pixel frequency spatial temporal blue noise.
        // StpTaaH passes its integer output pixel offset as 'o'.
3366    StpH1 StpTaaDitH(StpW2 o);
3367//------------------------------------------------------------------------------------------------------------------------------
3368    // Gather4 current frame motion {z,x,y} packed input, same as the 32-bit version (just renamed).
3369    StpU4 StpTaaMot4H(StpF2 p);
3370//------------------------------------------------------------------------------------------------------------------------------
3371    // Feedback {color, alpha}.
3372    // Bilinear fetch with clamp to edge.
3373    StpH4 StpTaaPriFedH(StpF2 p);
3374    // Gather4.
3375    StpH4 StpTaaPriFed4RH(StpF2 p);
3376    StpH4 StpTaaPriFed4GH(StpF2 p);
3377    StpH4 StpTaaPriFed4BH(StpF2 p);
3378    // Min/max sampling used for dering.
3379    #if STP_MAX_MIN_10BIT
3380        StpH4 StpTaaPriFedMaxH(StpF2 p);
3381        StpH4 StpTaaPriFedMinH(StpF2 p);
3382    #endif // STP_MAX_MIN_10BIT
3383    // Sampling with offsets.
        // 'o' is an integer offset. NOTE(review): presumably in feedback texels, as in the 32-bit path where the
        // non-offset fallback steps the coordinate by kRcpF per unit - confirm against the StpTaaH body.
3384    #if STP_OFFSETS
3385        StpH4 StpTaaPriFedOH(StpF2 p, StpI2 o);
3386        StpH4 StpTaaPriFed4ROH(StpF2 p, StpI2 o);
3387        StpH4 StpTaaPriFed4GOH(StpF2 p, StpI2 o);
3388        StpH4 StpTaaPriFed4BOH(StpF2 p, StpI2 o);
3389    #endif // STP_OFFSETS
3390//==============================================================================================================================
3391    void StpTaaH(
3392    StpW1 lane,   // Currently unused but in the interface for possible future expansion.
3393    StpW2 o,      // Integer pixel offset in output.
3394    out StpH4 rF, // Return Feedback (to be stored).
3395    out StpH4 rW, // Return Output (to be stored).
3396    StpU4 con0,   // Constants generated by StpTaaCon().
3397    StpU4 con1,
3398    StpU4 con2,
3399    StpU4 con3) {
3400//------------------------------------------------------------------------------------------------------------------------------
3401        // This is only currently used for debug.
3402        StpH1 dit = StpTaaDitH(o);
3403//------------------------------------------------------------------------------------------------------------------------------
3404        // Rename constants.
3405        StpF2 kCRcpF = StpF2_U2(con0.xy);
3406        StpF2 kHalfCRcpFUnjitC = StpF2_U2(con0.zw);
3407        StpF2 kRcpC = StpF2_U2(con1.xy);
3408        StpF2 kRcpF = StpF2_U2(con1.zw);
3409        StpF2 kHalfRcpF = StpF2_U2(con2.xy);
3410        StpF2 kJitCRcpC0 = StpF2_U2(con2.zw);
3411        StpF2 kHalfRcpC = StpF2_U2(con3.xy);
3412        StpF2 kF = StpF2_U2(con3.zw);
3413//------------------------------------------------------------------------------------------------------------------------------
3414        // Check the streaming bandwidth limit.
3415        #if STP_BUG_BW_SOL
3416        {   StpF2 oo = StpF2(o) * kRcpF;
3417            StpH4 g4 = StpTaaCtl4RH(oo);
3418            StpU4 m4 = StpTaaMot4H(oo);
3419            StpH1 cnv = StpTaaConH(oo);
3420            StpH4 f = StpTaaPriFedH(oo);
3421            StpH4 c4R = StpTaaCol4RH(oo);
3422            rW = rF = l4 + g4 + StpH4(m4) + StpH4_(cnv) + f + c4R;
3423            return; }
3424        #endif // STP_BUG_BW_SOL
3425//------------------------------------------------------------------------------------------------------------------------------
3426        // Locate 2x2 neighborhood.
3427        // Float version of integer pixel offset in output.
3428        // All the 'o' prefixed variables are offset (aka position/coordinate) related.
3429        StpF2 oI = StpF2(o);
3430        // This gets to the center of the 2x2 quad directly because of possibility of shader/tex precision mismatch.
3431        // Precision mismatch could yield different 2x2 quads.
3432        StpF2 oC = oI * kCRcpF + kHalfCRcpFUnjitC;
3433        // NW of 2x2 quad.
3434        StpF2 oCNW = floor(oC + StpF2_(-0.5));
3435        // Center of the 2x2 quad.
3436        StpF2 oC4 = oCNW * kRcpC + kRcpC;
3437        // Coordinates for low frequency convergence.
3438        StpF2 oC1 = oC * kRcpC;
3439//==============================================================================================================================
3440//      FETCH {CONVERGENCE, COLOR, CONTROL, Z+MOTION}
3441//==============================================================================================================================
3442        // Fetch low-frequency convergence.
3443        StpH1 cnv = StpTaaConH(oC1);
3444        // Fetch color.
3445        StpH4 c4R = StpTaaCol4RH(oC4);
3446        StpH4 c4G = StpTaaCol4GH(oC4);
3447        StpH4 c4B = StpTaaCol4BH(oC4);
3448        StpH4 c4A = StpTaaCol4AH(oC4);
3449        // Control (GEAA weights)
3450        StpH4 g4 = StpTaaCtl4H(oC4);
3451        // Fetch {z,motion}.
3452        StpU4 m4 = StpTaaMot4H(oC4);
3453//------------------------------------------------------------------------------------------------------------------------------
3454//      INDEPENDENT
3455//------------------------------------------------------------------------------------------------------------------------------
3456        // Setup resolve position {0 to 1} inside 2x2 quad.
3457        // The extra -0.5 is to get from NW position to center.
3458        StpH2 rP = StpH2(oC - oCNW) - StpH2_(0.5);
3459//------------------------------------------------------------------------------------------------------------------------------
3460        // The 'rP' is resolve position {0 to 1} inside 2x2 quad, this is distance to ends of 2x2.
3461        // Instead of using {a,a-1} this uses {a,1-a} for reuse with the simple angular filtering.
3462        StpH2 rPX10 = StpH2(1.0, 0.0) + StpH2(-rP.x, rP.x);
3463        StpH2 rPY01 = StpH2(0.0, 1.0) + StpH2(rP.y, -rP.y);
3464        // Distance^2 {0 := on, 1 := off}.
3465        StpH4 pen4x = StpH4(rPX10.g, rPX10.r, rPX10.r, rPX10.g);
3466        StpH4 pen4y = StpH4(rPY01.g, rPY01.g, rPY01.r, rPY01.r);
3467        // Pen starts with distance squared to all 2x2 points.
3468        StpH4 pen4 = StpSatH4(pen4x * pen4x + pen4y * pen4y);
3469//==============================================================================================================================
3470//      DEPENDENT ON {CONVERGENCE}
3471//==============================================================================================================================
3472        // Low frequency convergence keeps the next frame value, so subtract one frame.
3473        cnv = StpSatH1(cnv - StpH1_(1.0 / STP_FRAME_MAX));
3474//------------------------------------------------------------------------------------------------------------------------------
3475        // Pen size based on convergence.
3476        StpH1 pen = StpH1_(cnv) * StpH1_(STP_FRAME_MAX) + StpH1_(1.0);
3477        pen = StpPrxLoSqrtH1(pen);
3478        pen4 = StpSatH4(StpH4_(1.0) - pen4 * StpH4_(pen));
3479        #if defined(STP_16BIT)
3480            StpH2 pen2 = pen4.xy * pen4.xy + pen4.zw * pen4.zw;
3481            pen = StpSatH1(pen2.x + pen2.y);
3482        #else // defined(STP_16BIT)
3483            pen = StpSatMF1(pen4.x * pen4.x + pen4.y * pen4.y + pen4.z * pen4.z + pen4.w * pen4.w);
3484        #endif // defined(STP_16BIT)
3485//==============================================================================================================================
3486//      DEPENDENT ON {COLOR}
3487//==============================================================================================================================
3488        // Simple angular filtering (gets rid of block artifacts, adds sawtooth artifacts which are not a problem in practice).
3489        // Create a GEAA based weighting for no temporal feedback case.
3490        StpH4 wG;
3491        // Selects between either (S) or (T).
3492        //  (S) A--B ... (T) A--B
3493        //      |\ |         | /|
3494        //      | \|         |/ |
3495        //      R--G         R--G
3496        // S and T only use the other diagonal.
3497        // Exact luma not required.
3498        StpH4 l4 = c4R + c4G * StpH4_(2.0) + c4B;
3499        StpH2 difST = abs(l4.gr - l4.ab);
3500        // Choose configuration based on which difference is maximum.
3501        StpP1 useS = difST.x > difST.y;
3502        // Choose interpolation weights given the configuration.
3503        //      _T__________  _S__________
3504        //  R | sat( -x+  y)  min(1-x,  y) = y-G
3505        //  G | min(  x,  y)  sat(x-1+  y) = y-R
3506        //  B | sat(  x-  y)  min(  x,1-y) = (1-y)-A
3507        //  A | min(1-x,1-y)  sat(1-x-  y) = (1-y)-B
3508        // Difference between S and T is a {x} vs {1-x} and a RGBA vs GRAB swap.
3509        StpH2 wTrb = StpSatH2(StpH2(-rP.x, rP.x) + StpH2(rP.y, -rP.y));
3510        StpH2 wSrb = min(rPX10, rPY01);
3511        if(useS) wTrb = wSrb;
3512        StpH2 wTga = rPY01 - wTrb;
3513        wG.rg = StpH2(wTrb.x, wTga.x);
3514        wG.ba = StpH2(wTrb.y, wTga.y);
3515        // Shaping is needed to get good high area scaling (remove the transition region).
3516        wG *= wG;
3517        wG *= wG;
3518//------------------------------------------------------------------------------------------------------------------------------
3519        // Scale directional interpolation weights by GEAA weights to introduce anti-aliasing.
3520        wG *= g4;
3521        // Triangular nearest.
3522        // This works by removing the corner which contributes the least to the spatial interpolated result.
3523        StpH4 triMask = StpH4_(1.0);
3524        StpH2 wGmin2 = min(wG.xy, wG.zw);
3525//==============================================================================================================================
3526//      DEPENDENT ON {Z,MOTION}
3527//==============================================================================================================================
3528        // This overwrites gather4 results.
3529        if(wGmin2.x < wGmin2.y) {
3530            if(wG.x < wG.z) { triMask.x = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.x = 0xFFFFFFFF; }
3531            else            { triMask.z = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.z = 0xFFFFFFFF; } }
3532        else {
3533            if(wG.y < wG.w) { triMask.y = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.y = 0xFFFFFFFF; }
3534            else            { triMask.w = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.w = 0xFFFFFFFF; } }
3535        StpU1 m1 = min(StpMin3U1(m4.x, m4.y, m4.z), m4.w);
3536//------------------------------------------------------------------------------------------------------------------------------
3537        // Want to consume 'triMask' to free up register space.
3538        wG *= triMask;
3539//------------------------------------------------------------------------------------------------------------------------------
3540        StpF2 mXY;
3541        // Motion 'm' units are {1 := move by one screen}.
3542        StpMvUnpackV(mXY, m1);
3543//==============================================================================================================================
3544//      GET ALL FEEDBACK FILTERING DONE
3545//==============================================================================================================================
3546        // This region of code will have the highest register pressure in some configs, so doing as early as possible.
3547        // Setup for fetch feedback.
3548        StpF2 oF = oI * kRcpF + kHalfRcpF - mXY;
3549//------------------------------------------------------------------------------------------------------------------------------
3550        StpH3 f;
3551        // Lanczos common.
3552        #if STP_TAA_PRX_LANCZOS
3553            // Motion reprojection position in feedback pixels.
3554            StpF2 oM = oI + StpF2_(0.5) - mXY * kF;
3555            // NW of center 2x2 quad.
3556            StpF2 oMNW = floor(oM + StpF2_(-0.5));
3557            // Center of the center 2x2 quad.
3558            StpF2 oM4 = oMNW * kRcpF + kRcpF;
3559            StpH3 fMax, fMin;
3560        #else // STP_TAA_PRX_LANCZOS
3561            // Sample nearest feedback.
3562            f = StpTaaPriFedH(oF).rgb;
3563        #endif // STP_TAA_PRX_LANCZOS
3564//==============================================================================================================================
3565        #if (STP_TAA_PRX_LANCZOS == 1)
3566            // This one does a fixed 1x4 to try to cut cost in half relative to the complete 4x4.
3567            // It uses bilinear sampling on the 'x'.
3568            // Lanczos on the 'y' because most floating camera motion is 'y' based.
3569            // Fetch {feedback}.
3570            #if STP_OFFSETS
3571                // TODO: Can optimize out the 'oM4.y' add with constant change.
3572                StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
3573                StpH3 f0 = StpTaaPriFedH(oM0).rgb;
3574                StpH3 f1 = StpTaaPriFedOH(oM0, StpI2(0, 1)).rgb;
3575                StpH3 f2 = StpTaaPriFedOH(oM0, StpI2(0, 2)).rgb;
3576                StpH3 f3 = StpTaaPriFedOH(oM0, StpI2(0, 3)).rgb;
3577            #else // STP_OFFSETS
3578                StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
3579                StpF2 oM1 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-0.5));
3580                StpF2 oM2 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 0.5));
3581                StpF2 oM3 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 1.5));
3582                StpH3 f0 = StpTaaPriFedH(oM0).rgb;
3583                StpH3 f1 = StpTaaPriFedH(oM1).rgb;
3584                StpH3 f2 = StpTaaPriFedH(oM2).rgb;
3585                StpH3 f3 = StpTaaPriFedH(oM3).rgb;
3586            #endif // STP_OFFSETS
3587            // Want this last because it's used last.
3588            #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
3589                fMax = StpTaaPriFedMaxH(oM4).rgb;
3590                fMin = StpTaaPriFedMinH(oM4).rgb;
3591            #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
3592            #if ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
3593                // Without {min,max} sampling, must gather4.
3594                StpH4 f4R = StpTaaPriFed4RH(oM4);
3595                StpH4 f4G = StpTaaPriFed4GH(oM4);
3596                StpH4 f4B = StpTaaPriFed4BH(oM4);
3597            #endif // ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
3598//------------------------------------------------------------------------------------------------------------------------------
3599//          INDEPENDENT
3600//------------------------------------------------------------------------------------------------------------------------------
3601            // Convert to approximate lanczos weights.
3602            // Feedback position {0 to 1} inside 2x2 quad + 0.5.
3603            StpH2 fP = StpH2(oM - oMNW);
3604            // Convert to approximate lanczos weights.
3605            // This converts {-2 to 2} to {-1 to 1} because the kernel approximation is written for {-1 to 1}.
3606            StpH4 fPY = StpH4_(-fP.y * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
3607            // Weights in one axis.
3608            fPY = StpSatH4(StpH4_(1.0) - fPY * fPY);
3609            fPY *= fPY;
3610            StpH4 fPY4 = fPY * fPY;
3611            // ^6 (slightly more negative lobe than lanczos 2, slightly less expensive)
3612            fPY = (StpH4_(1.0 + 81.0 / 175.0) * fPY4 - StpH4_(81.0 / 175.0)) * fPY;
3613            #if defined(STP_16BIT)
3614                StpH2 fRcp2 = fPY.rg + fPY.ba;
3615                StpH1 fRcp = StpPrxLoRcpH1(fRcp2.x + fRcp2.y);
3616            #else // defined(STP_16BIT)
3617                StpMF1 fRcp = StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a);
3618            #endif // defined(STP_16BIT)
3619//------------------------------------------------------------------------------------------------------------------------------
3620//          DEPENDENT
3621//------------------------------------------------------------------------------------------------------------------------------
3622            f.rgb = f0 * StpH3_(fPY.r) + f1 * StpH3_(fPY.g) + f2 * StpH3_(fPY.b) + f3 * StpH3_(fPY.a);
3623            f.rgb *= StpH3_(fRcp);
3624            #if STP_TAA_PRX_LANCZOS_DERING
3625                #if (STP_MAX_MIN_10BIT == 0)
3626                    #if defined(STP_16BIT)
3627                        StpH2 fXnyR = max(max(StpH2(f4R.x, -f4R.x), StpH2(f4R.y, -f4R.y)),
3628                                          max(StpH2(f4R.z, -f4R.z), StpH2(f4R.w, -f4R.w)));
3629                        StpH2 fXnyG = max(max(StpH2(f4G.x, -f4G.x), StpH2(f4G.y, -f4G.y)),
3630                                          max(StpH2(f4G.z, -f4G.z), StpH2(f4G.w, -f4G.w)));
3631                        StpH2 fXnyB = max(max(StpH2(f4B.x, -f4B.x), StpH2(f4B.y, -f4B.y)),
3632                                          max(StpH2(f4B.z, -f4B.z), StpH2(f4B.w, -f4B.w)));
3633                        f = clamp(f, StpH3(-fXnyR.y, -fXnyG.y, -fXnyB.y), StpH3(fXnyR.x, fXnyG.x, fXnyB.x));
3634                    #else // defined(STP_16BIT)
3635                        fMax.r = max(StpMax3H1(f4R.x, f4R.y, f4R.z), f4R.w);
3636                        fMax.g = max(StpMax3H1(f4G.x, f4G.y, f4G.z), f4G.w);
3637                        fMax.b = max(StpMax3H1(f4B.x, f4B.y, f4B.z), f4B.w);
3638                        fMin.r = min(StpMin3H1(f4R.x, f4R.y, f4R.z), f4R.w);
3639                        fMin.g = min(StpMin3H1(f4G.x, f4G.y, f4G.z), f4G.w);
3640                        fMin.b = min(StpMin3H1(f4B.x, f4B.y, f4B.z), f4B.w);
3641                        f = clamp(f, fMin, fMax);
3642                    #endif // defined(STP_16BIT)
3643                #else // (STP_MAX_MIN_10BIT == 0)
3644                    // Leaning on {min,max} sampling so no 16/32-bit permutation.
3645                    f = clamp(f, fMin, fMax);
3646                #endif // (STP_MAX_MIN_10BIT == 0)
3647            #endif // STP_TAA_PRX_LANCZOS_DERING
3648        #endif // (STP_TAA_PRX_LANCZOS == 1)
3649//==============================================================================================================================
3650        #if (STP_TAA_PRX_LANCZOS == 2)
3651            // Unstable approximate lanczos feedback, full 4x4.
3652            //  a = saturate(1-x*x)
3653            //  u = 1+v
3654            //  v = moves the zero crossing to 0.5
3655            //  w = adjusts the shape
3656            //  u*a^w - v*a^2
3657            // Fetch {feedback}.
3658            //  0w 0z 1w 1z | R
3659            //  0x 0y 1x 1y | G
3660            //  2w 2z 3w 3z | B
3661            //  2x 2y 3x 3y | A
3662            //  -- -- -- --
3663            //  R  G  B  A
3664            #if STP_OFFSETS
3665                StpH4 f4R0 = StpTaaPriFed4ROH(oM4, StpI2(-1, -1));
3666                StpH4 f4G0 = StpTaaPriFed4GOH(oM4, StpI2(-1, -1));
3667                StpH4 f4B0 = StpTaaPriFed4BOH(oM4, StpI2(-1, -1));
3668                StpH4 f4R1 = StpTaaPriFed4ROH(oM4, StpI2( 1, -1));
3669                StpH4 f4G1 = StpTaaPriFed4GOH(oM4, StpI2( 1, -1));
3670                StpH4 f4B1 = StpTaaPriFed4BOH(oM4, StpI2( 1, -1));
3671                StpH4 f4R2 = StpTaaPriFed4ROH(oM4, StpI2(-1,  1));
3672                StpH4 f4G2 = StpTaaPriFed4GOH(oM4, StpI2(-1,  1));
3673                StpH4 f4B2 = StpTaaPriFed4BOH(oM4, StpI2(-1,  1));
3674                StpH4 f4R3 = StpTaaPriFed4ROH(oM4, StpI2( 1,  1));
3675                StpH4 f4G3 = StpTaaPriFed4GOH(oM4, StpI2( 1,  1));
3676                StpH4 f4B3 = StpTaaPriFed4BOH(oM4, StpI2( 1,  1));
3677            #else // STP_OFFSETS
3678                StpF2 oM0 = oM4 + StpF2(-kRcpF.x, -kRcpF.y);
3679                StpF2 oM1 = oM4 + StpF2( kRcpF.x, -kRcpF.y);
3680                StpF2 oM2 = oM4 + StpF2(-kRcpF.x,  kRcpF.y);
3681                StpF2 oM3 = oM4 + StpF2( kRcpF.x,  kRcpF.y);
3682                StpH4 f4R0 = StpTaaPriFed4RH(oM0);
3683                StpH4 f4G0 = StpTaaPriFed4GH(oM0);
3684                StpH4 f4B0 = StpTaaPriFed4BH(oM0);
3685                StpH4 f4R1 = StpTaaPriFed4RH(oM1);
3686                StpH4 f4G1 = StpTaaPriFed4GH(oM1);
3687                StpH4 f4B1 = StpTaaPriFed4BH(oM1);
3688                StpH4 f4R2 = StpTaaPriFed4RH(oM2);
3689                StpH4 f4G2 = StpTaaPriFed4GH(oM2);
3690                StpH4 f4B2 = StpTaaPriFed4BH(oM2);
3691                StpH4 f4R3 = StpTaaPriFed4RH(oM3);
3692                StpH4 f4G3 = StpTaaPriFed4GH(oM3);
3693                StpH4 f4B3 = StpTaaPriFed4BH(oM3);
3694            #endif // STP_OFFSETS
3695            // Want this last because it's used last.
3696            #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
3697                fMax = StpTaaPriFedMaxH(oM4).rgb;
3698                fMin = StpTaaPriFedMinH(oM4).rgb;
3699            #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
3700//------------------------------------------------------------------------------------------------------------------------------
3701//          INDEPENDENT
3702//------------------------------------------------------------------------------------------------------------------------------
3703            // Feedback position {0 to 1} inside 2x2 quad + 0.5.
3704            StpH2 fP = StpH2(oM - oMNW);
3705            // Convert to approximate lanczos weights.
3706            // This converts {-2 to 2} to {-1 to 1} because the kernel approximation is written for {-1 to 1}.
3707            StpH4 fPX = StpH4_(-fP.x * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
3708            StpH4 fPY = StpH4_(-fP.y * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
3709            // Weights in both axes.
3710            fPX = StpSatH4(StpH4_(1.0) - fPX * fPX);
3711            fPY = StpSatH4(StpH4_(1.0) - fPY * fPY);
3712            fPX *= fPX;
3713            fPY *= fPY;
3714            StpH4 fPX4 = fPX * fPX;
3715            StpH4 fPY4 = fPY * fPY;
3716            // ^6 (slightly more negative lobe than lanczos 2, slightly less expensive)
3717            fPX = (StpH4_(1.0 + 81.0 / 175.0) * fPX4 - StpH4_(81.0 / 175.0)) * fPX;
3718            fPY = (StpH4_(1.0 + 81.0 / 175.0) * fPY4 - StpH4_(81.0 / 175.0)) * fPY;
3719            #if defined(STP_16BIT)
3720                StpH2 fRcpX = fPX.rg + fPX.ba;
3721                StpH2 fRcpY = fPY.rg + fPY.ba;
3722                fPX *= StpH4_(StpPrxLoRcpH1(fRcpX.r + fRcpX.y));
3723                fPY *= StpH4_(StpPrxLoRcpH1(fRcpY.r + fRcpY.y));
3724            #else // defined(STP_16BIT)
3725                fPX *= StpMF4_(StpPrxLoRcpMF1(fPX.r + fPX.g + fPX.b + fPX.a));
3726                fPY *= StpMF4_(StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a));
3727            #endif // defined(STP_16BIT)
3728            StpH4 fPX0 = fPX * StpH4_(fPY.r);
3729            StpH4 fPX1 = fPX * StpH4_(fPY.g);
3730            StpH4 fPX2 = fPX * StpH4_(fPY.b);
3731            StpH4 fPX3 = fPX * StpH4_(fPY.a);
3732//------------------------------------------------------------------------------------------------------------------------------
3733//          DEPENDENT
3734//------------------------------------------------------------------------------------------------------------------------------
3735            #if defined(STP_16BIT)
3736                StpH2 fR2 = f4R0.wz * fPX0.xy + f4R1.wz * fPX0.zw + f4R0.xy * fPX1.xy + f4R1.xy * fPX1.zw +
3737                            f4R2.wz * fPX2.xy + f4R3.wz * fPX2.zw + f4R2.xy * fPX3.xy + f4R3.xy * fPX3.zw;
3738                StpH2 fG2 = f4G0.wz * fPX0.xy + f4G1.wz * fPX0.zw + f4G0.xy * fPX1.xy + f4G1.xy * fPX1.zw +
3739                            f4G2.wz * fPX2.xy + f4G3.wz * fPX2.zw + f4G2.xy * fPX3.xy + f4G3.xy * fPX3.zw;
3740                StpH2 fB2 = f4B0.wz * fPX0.xy + f4B1.wz * fPX0.zw + f4B0.xy * fPX1.xy + f4B1.xy * fPX1.zw +
3741                            f4B2.wz * fPX2.xy + f4B3.wz * fPX2.zw + f4B2.xy * fPX3.xy + f4B3.xy * fPX3.zw;
3742                f = StpH3(fR2.x + fR2.y, fG2.x + fG2.y, fB2.x + fB2.y);
3743            #else // defined(STP_16BIT)
3744                f.r = f4R0.w * fPX0.r + f4R0.z * fPX0.g + f4R1.w * fPX0.b + f4R1.z * fPX0.a +
3745                      f4R0.x * fPX1.r + f4R0.y * fPX1.g + f4R1.x * fPX1.b + f4R1.y * fPX1.a +
3746                      f4R2.w * fPX2.r + f4R2.z * fPX2.g + f4R3.w * fPX2.b + f4R3.z * fPX2.a +
3747                      f4R2.x * fPX3.r + f4R2.y * fPX3.g + f4R3.x * fPX3.b + f4R3.y * fPX3.a;
3748                f.g = f4G0.w * fPX0.r + f4G0.z * fPX0.g + f4G1.w * fPX0.b + f4G1.z * fPX0.a +
3749                      f4G0.x * fPX1.r + f4G0.y * fPX1.g + f4G1.x * fPX1.b + f4G1.y * fPX1.a +
3750                      f4G2.w * fPX2.r + f4G2.z * fPX2.g + f4G3.w * fPX2.b + f4G3.z * fPX2.a +
3751                      f4G2.x * fPX3.r + f4G2.y * fPX3.g + f4G3.x * fPX3.b + f4G3.y * fPX3.a;
3752                f.b = f4B0.w * fPX0.r + f4B0.z * fPX0.g + f4B1.w * fPX0.b + f4B1.z * fPX0.a +
3753                      f4B0.x * fPX1.r + f4B0.y * fPX1.g + f4B1.x * fPX1.b + f4B1.y * fPX1.a +
3754                      f4B2.w * fPX2.r + f4B2.z * fPX2.g + f4B3.w * fPX2.b + f4B3.z * fPX2.a +
3755                      f4B2.x * fPX3.r + f4B2.y * fPX3.g + f4B3.x * fPX3.b + f4B3.y * fPX3.a;
3756            #endif // defined(STP_16BIT)
3757            #if STP_TAA_PRX_LANCZOS_DERING
3758                #if (STP_MAX_MIN_10BIT == 0)
3759                    #if defined(STP_16BIT)
3760                        StpH2 fXnyR = max(max(StpH2(f4R0.y, -f4R0.y), StpH2(f4R1.x, -f4R1.x)),
3761                                          max(StpH2(f4R2.z, -f4R2.z), StpH2(f4R3.w, -f4R3.w)));
3762                        StpH2 fXnyG = max(max(StpH2(f4G0.y, -f4G0.y), StpH2(f4G1.x, -f4G1.x)),
3763                                          max(StpH2(f4G2.z, -f4G2.z), StpH2(f4G3.w, -f4G3.w)));
3764                        StpH2 fXnyB = max(max(StpH2(f4B0.y, -f4B0.y), StpH2(f4B1.x, -f4B1.x)),
3765                                          max(StpH2(f4B2.z, -f4B2.z), StpH2(f4B3.w, -f4B3.w)));
3766                        f = clamp(f, StpH3(-fXnyR.y, -fXnyG.y, -fXnyB.y), StpH3(fXnyR.x, fXnyG.x, fXnyB.x));
3767                    #else // defined(STP_16BIT)
3768                        fMax.r = max(StpMax3H1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
3769                        fMax.g = max(StpMax3H1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
3770                        fMax.b = max(StpMax3H1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
3771                        fMin.r = min(StpMin3H1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
3772                        fMin.g = min(StpMin3H1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
3773                        fMin.b = min(StpMin3H1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
3774                        f = clamp(f, fMin, fMax);
3775                    #endif // defined(STP_16BIT)
3776                #else // (STP_MAX_MIN_10BIT == 0)
3777                    // Leaning on {min,max} sampling so no 16/32-bit permutation.
3778                    f = clamp(f, fMin, fMax);
3779                #endif // (STP_MAX_MIN_10BIT == 0)
3780            #endif // STP_TAA_PRX_LANCZOS_DERING
3781        #endif // (STP_TAA_PRX_LANCZOS == 2)
3782//==============================================================================================================================
3783//      DISPLACEMENT
3784//==============================================================================================================================
3785        // Note the 'kJitCRcpC0' gets to position 0 to save some runtime maths.
3786        //  3 2
3787        //  0 1
3788        StpF2 oD0 = oC4 + kJitCRcpC0 - mXY;
3789        StpF2 oD1 = StpF2(kRcpC.x,      0.0) + oD0;
3790        StpF2 oD2 = StpF2(kRcpC.x, -kRcpC.y) + oD0;
3791        StpF2 oD3 = StpF2(0.0,     -kRcpC.y) + oD0;
3792        StpH3 d0 = StpTaaPriFedH(oD0).rgb;
3793        StpH3 d1 = StpTaaPriFedH(oD1).rgb;
3794        StpH3 d2 = StpTaaPriFedH(oD2).rgb;
3795        StpH3 d3 = StpTaaPriFedH(oD3).rgb;
3796//------------------------------------------------------------------------------------------------------------------------------
3797//      INDEPENDENT
3798//------------------------------------------------------------------------------------------------------------------------------
3799        // Normalize interpolation weights.
3800        #if defined(STP_16BIT)
3801            StpH2 wG2 = wG.xy + wG.zw;
3802            wG = StpSatH4(wG * StpH4_(StpPrxLoRcpH1(wG2.x + wG2.y)));
3803        #else // defined(STP_16BIT)
3804            wG = StpSatMF4(wG * StpMF4_(StpPrxLoRcpMF1(wG.x + wG.y + wG.z + wG.w)));
3805        #endif // defined(STP_16BIT)
3806//------------------------------------------------------------------------------------------------------------------------------
3807        // Temporal weighting.
3808        StpH4 wT = abs(c4R - StpH4_(f.r)) * StpH4_(STP_LUMA_R) +
3809                   abs(c4G - StpH4_(f.g)) * StpH4_(STP_LUMA_G) +
3810                   abs(c4B - StpH4_(f.b)) * StpH4_(STP_LUMA_B);
3811        wT = StpPrxLoRcpH4(wT * StpH4_(STP_ANTI_MAX) + StpH4_(STP_ANTI_MIN)) * triMask;
3812//------------------------------------------------------------------------------------------------------------------------------
3813        #if defined(STP_16BIT)
3814            StpH2 wT2 = wT.xy + wT.zw;
3815            wT = StpSatH4(wT * StpH4_(StpPrxLoRcpH1(wT2.x + wT2.y)));
3816        #else // defined(STP_16BIT)
3817            wT = StpSatMF4(wT * StpMF4_(StpPrxLoRcpMF1(wT.x + wT.y + wT.z + wT.w)));
3818        #endif // defined(STP_16BIT)
3819//------------------------------------------------------------------------------------------------------------------------------
3820        // Interpolate match.
3821        // Using a fixed 50/50 split of two normalized weights yields a normalized weight.
3822        StpH4 wM = wT * StpH4_(0.5) + wG * StpH4_(0.5);
3823        #if defined(STP_16BIT)
3824            StpH2 match2 = (c4A.xy * wM.xy) + (c4A.zw * wM.zw);
3825            StpH1 match = match2.x + match2.y;
3826        #else // defined(STP_16BIT)
3827            StpMF1 match = c4A.x * wM.x + c4A.y * wM.y + c4A.z * wM.z + c4A.w * wM.w;
3828        #endif // defined(STP_16BIT)
3829        // Non-motion-match kills convergence for this frame only.
3830        cnv *= match;
3831//------------------------------------------------------------------------------------------------------------------------------
3832//      DEPENDENT
3833//------------------------------------------------------------------------------------------------------------------------------
3834        // Interpolation, this first section doesn't have gather4, so probably no gain in swizzling.
3835        StpH3 dG = d0 * StpH3_(wG.x) + d1 * StpH3_(wG.y) + d2 * StpH3_(wG.z) + d3 * StpH3_(wG.w);
3836        StpH3 dT = d0 * StpH3_(wT.x) + d1 * StpH3_(wT.y) + d2 * StpH3_(wT.z) + d3 * StpH3_(wT.w);
3837//------------------------------------------------------------------------------------------------------------------------------
3838        #if defined(STP_16BIT)
3839            StpH2 t2R = (c4R.xy * wT.xy) + (c4R.zw * wT.zw);
3840            StpH2 t2G = (c4G.xy * wT.xy) + (c4G.zw * wT.zw);
3841            StpH2 t2B = (c4B.xy * wT.xy) + (c4B.zw * wT.zw);
3842            StpH3 t = StpH3(t2R.x + t2R.y, t2G.x + t2G.y, t2B.x + t2B.y);
3843            StpH2 c2R = (c4R.xy * wG.xy) + (c4R.zw * wG.zw);
3844            StpH2 c2G = (c4G.xy * wG.xy) + (c4G.zw * wG.zw);
3845            StpH2 c2B = (c4B.xy * wG.xy) + (c4B.zw * wG.zw);
3846            StpH3 c = StpH3(c2R.x + c2R.y, c2G.x + c2G.y, c2B.x + c2B.y);
3847        #else // defined(STP_16BIT)
3848            StpMF3 t = StpMF3(
3849                c4R.x * wT.x + c4R.y * wT.y + c4R.z * wT.z + c4R.w * wT.w,
3850                c4G.x * wT.x + c4G.y * wT.y + c4G.z * wT.z + c4G.w * wT.w,
3851                c4B.x * wT.x + c4B.y * wT.y + c4B.z * wT.z + c4B.w * wT.w);
3852            StpMF3 c = StpMF3(
3853                c4R.x * wG.x + c4R.y * wG.y + c4R.z * wG.z + c4R.w * wG.w,
3854                c4G.x * wG.x + c4G.y * wG.y + c4G.z * wG.z + c4G.w * wG.w,
3855                c4B.x * wG.x + c4B.y * wG.y + c4B.z * wG.z + c4B.w * wG.w);
3856        #endif // defined(STP_16BIT)
3857//------------------------------------------------------------------------------------------------------------------------------
3858        // Neighborhood.
3859        StpH1 bln = StpSatH1(cnv * StpPrxLoRcpH1(cnv + StpH1_(1.0 / STP_FRAME_MAX)));
3860        StpH1 blnT = StpH1_(1.0) - bln;
3861        StpH3 b = f * StpH3_(bln) + t * StpH3_(blnT);
3862        StpH3 minNe = min(c, b);
3863        StpH3 maxNe = max(c, b);
3864//------------------------------------------------------------------------------------------------------------------------------
3865        // Apply pen.
3866        StpH3 penC = StpSatH3(c + (f - dG) * StpH3_(StpH1_(0.9875) * match));
3867        StpH2 penWF;
3868        penWF.x = pen * StpH1_(STP_TAA_PEN_W);
3869        penWF.y = pen * lerp(StpH1_(STP_TAA_PEN_F0), StpH1_(STP_TAA_PEN_F1), cnv);
3870        StpH2 penNotWF = StpH2_(1.0) - penWF;
3871        rF.rgb = t + (f - dT);
3872        rF.rgb = rF.rgb * StpH3_(blnT) + f * StpH3_(bln);
3873        rW.rgb = StpSatH3(rF.rgb * StpH3_(penNotWF.x) + penC * StpH3_(penWF.x));
3874        rF.rgb = StpSatH3(rF.rgb * StpH3_(penNotWF.y) + penC * StpH3_(penWF.y));
3875        rW.rgb = clamp(rW.rgb, minNe, maxNe);
3876        rF.rgb = clamp(rF.rgb, minNe, maxNe);
3877//------------------------------------------------------------------------------------------------------------------------------
3878        // Get back into linear, and then HDR.
3879        rW.rgb *= rW.rgb;
3880        #if (STP_POSTMAP == 0)
3881            StpToneInvH3(rW.rgb);
3882        #endif // (STP_POSTMAP == 0)
3883        // Alpha is currently unused, this might improve compression (vs undefined).
3884        rF.a = rW.a = StpH1(0.0); }
3885#endif // defined(STP_GPU) && defined(STP_TAA) && defined(STP_16BIT)
3886////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3887////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3888////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3889////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3890//_____________________________________________________________.._______________________________________________________________
3891//==============================================================================================================================
3892//
3893//                                                GOOD ENOUGH ANTI-ALIASING [GEAA]
3894//
3895//------------------------------------------------------------------------------------------------------------------------------
3896// Yet another simplified spatial morphological AA.
3897// Not perfect, but it has low complexity (one pass), and is good enough for a TAA override.
3898// Fails on longer edges (due to low maximum search), doesn't get diagonals perfect.
3899// But good on inputs that are already partially AA'ed.
3900// The spatial AA is not used in STP, only a weighting value which is later used to guide a quick-and-dirty scaler.
3901// With some modification this could be used for spatial AA, with or without scaling.
3902//------------------------------------------------------------------------------------------------------------------------------
3903// CALLBACKS
3904// =========
3905// StpMF4 StpGeaa4F(StpF2 p) - Gather4 of luma (or green as luma).
3906// ---------
3907// StpH4 StpGeaa4H(StpF2 p)
3908//==============================================================================================================================
3909////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3910////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3911//_____________________________________________________________.._______________________________________________________________
3912//==============================================================================================================================
3913//                                                      [GEAA] DEFAULTS
3914//==============================================================================================================================
3915// STP_GEAA_P: choose a configuration for the number of positions GEAA samples.
3916//  0 ... 3 per side (faster, less quality)
3917//  1 ... 5 per side
3918//  2 ... 7 per side
3919//  3 ... 9 per side (slower, higher quality)
// The #ifndef guard keeps any value defined before this point; otherwise the default below (3) is used.
3920#ifndef STP_GEAA_P
3921    #define STP_GEAA_P 3
3922#endif // STP_GEAA_P
3923//------------------------------------------------------------------------------------------------------------------------------
3924// STP_GEAA_SUBPIX: amount of sub-pixel blur.
3925//  0.50 ... Turn it off
3926//  0.25 ... Middle ground
3927//  0.00 ... More blur
// The #ifndef guard keeps any value defined before this point; the default below is (8.0 / 16.0) = 0.50,
// which per the table above turns the sub-pixel blur off.
3928#ifndef STP_GEAA_SUBPIX
3929    #define STP_GEAA_SUBPIX (8.0 / 16.0)
3930#endif // STP_GEAA_SUBPIX
3931////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3932////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3933//_____________________________________________________________.._______________________________________________________________
3934//==============================================================================================================================
3935//                                                  [GEAA] INTERNAL TUNING
3936//==============================================================================================================================
3937// STP_GEAA_THRESHOLD: higher numbers can reduce the amount of AA, lower numbers can increase it but can look dirty.
3938// Best not to mess with this, 1/3 is the 'correct' value for 2 of the 3 edge cases.
// Defined unconditionally (no #ifndef guard), so it is a fixed internal constant rather than an overridable default.
3939#define STP_GEAA_THRESHOLD (1.0/3.0)
3940////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3941////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3942//_____________________________________________________________.._______________________________________________________________
3943//==============================================================================================================================
3944//                                                  [GEAA] 32-BIT ENTRY POINT
3945//==============================================================================================================================
3946// See the 16-bit version for all comments.
3947#if defined(STP_GPU) && defined(STP_GEAA) && defined(STP_32BIT)
3948    void StpGeaaF(
3949    out StpMF1 gW, out StpMF1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI) {
3950//------------------------------------------------------------------------------------------------------------------------------
            // 32-bit GEAA entry point. Mirrors the packed 16-bit entry point (StpGeaaH) using StpMF* types and the
            // ...F/...MF helper variants; parameter meanings below are the ones documented on the 16-bit signature.
            //  gW ......... (out) Output weight for pixel art scalar.
            //  gLuma ...... (out) Filtered luma for debug.
            //  gFilter .... (out) Location to sample for standalone unscaled spatial AA.
            //  gDilate .... (out) Location of highest contrast neighbor.
            //  p .......... {0 to 1} position across screen.
            //  kRcpI ...... 1.0 / input image size in pixels.
            //  kHalfRcpI .. 0.5 / input image size in pixels.
            // Sample the 3x3 luma (or green) neighborhood with four gathers,
            //  A B C
            //  D E F
            //  G H I
            // With STP_OFFSETS the three extra gathers reuse 'pDEBA' with static integer offsets instead of new positions.
3951        #if STP_OFFSETS
3952            StpF2 pDEBA = p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y);
3953            StpMF4 gDEBA = StpGeaa4F(pDEBA);
3954            StpMF4 gEFCB = StpGeaa4OF(pDEBA, StpI2(1, 0));
3955            StpMF4 gGHED = StpGeaa4OF(pDEBA, StpI2(0, 1));
3956            StpMF4 gHIFE = StpGeaa4OF(pDEBA, StpI2(1, 1));
3957        #else // STP_OFFSETS
3958            StpMF4 gDEBA = StpGeaa4F(p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y)); // .xyzw=DEBA
3959            StpMF4 gEFCB = StpGeaa4F(p + StpF2( kHalfRcpI.x, -kHalfRcpI.y)); // .yz  =FC
3960            StpMF4 gGHED = StpGeaa4F(p + StpF2(-kHalfRcpI.x,  kHalfRcpI.y)); // .xy  =GH
3961            StpMF4 gHIFE = StpGeaa4F(p + StpF2( kHalfRcpI.x,  kHalfRcpI.y)); // .y   =I
3962        #endif // STP_OFFSETS
3963//------------------------------------------------------------------------------------------------------------------------------
            // {horz,vert} change terms: second differences (a - 2b + c) along each row (.x) and each column (.y).
            // {ABC,ADG}
3964        StpMF2 gHV0,gHV1,gHV2;
3965        gHV0.x = gDEBA.z * StpMF1_(-2.0) + gEFCB.z;
3966        gHV0.y = gDEBA.x * StpMF1_(-2.0) + gGHED.x;
3967        gHV0 += StpMF2_(gDEBA.w);
            // {DEF,BEH}
3968        gHV1.x = gDEBA.x + gEFCB.y;
3969        gHV1.y = gDEBA.z + gGHED.y;
3970        gHV1 += StpMF2_(gDEBA.y) * StpMF2_(-2.0);
            // {GHI,CFI}
3971        gHV2.x = gGHED.x + gGHED.y * StpMF1_(-2.0);
3972        gHV2.y = gEFCB.z + gEFCB.y * StpMF1_(-2.0);
3973        gHV2 += StpMF2_(gHIFE.y);
            // Combine terms, the center row/column is weighted 2x.
            // The disabled branch is the abs() form (what FXAA does per the 16-bit notes), the live branch uses squares.
3974        #if 0
3975            StpMF2 gHV = abs(gHV0) + abs(gHV1) * StpMF2_(2.0) + abs(gHV2);
3976        #else
3977            StpMF2 gHV = gHV0 * gHV0 + gHV1 * gHV1 * StpMF2_(2.0) + gHV2 * gHV2;
3978        #endif
            // Choose search direction, 'gVert' is true:=vert, false:=horz.
            // Search is perpendicular to the contrast: go vertical when the horizontal term is larger.
3979        StpP1 gVert = gHV.x > gHV.y;
3980//------------------------------------------------------------------------------------------------------------------------------
            // Transpose the neighborhood when searching vertically so the rest can be written for the horizontal case.
            // 'gBH' is {B,H} for a horizontal search, else {D,F} standing in as {B,H}.
3981        StpMF2 gBH = gVert ? StpMF2(gDEBA.x, gEFCB.y) : StpMF2(gDEBA.z, gGHED.y);
3982        StpMF2 gAC = gVert ? StpMF2(gDEBA.w, gGHED.x) : StpMF2(gDEBA.w, gEFCB.z);
3983        StpMF2 gDF = gVert ? StpMF2(gDEBA.z, gGHED.y) : StpMF2(gDEBA.x, gEFCB.y);
3984        StpMF2 gGI = gVert ? StpMF2(gEFCB.y, gHIFE.y) : StpMF2(gGHED.x, gHIFE.y);
            // Gradient pair from the center to each of the two perpendicular neighbors.
3985        StpMF2 gBHMinusE = gBH - StpMF2_(gDEBA.y);
3986        StpMF2 gEnd2 = abs(gBHMinusE);
            // True if the gradient is larger upward (or leftward if vert).
3987        StpP1 gUp = gEnd2.x >= gEnd2.y;
3988//------------------------------------------------------------------------------------------------------------------------------
            // Rename center texel.
3989        StpMF1 gE = gDEBA.y;
            // Swap if not up. From here on gBH.x is the high-contrast neighbor and gBH.y is the opposite one.
3990        gBH = gUp ? gBH : gBH.yx;
3991//------------------------------------------------------------------------------------------------------------------------------
            // Bilinear scalars for the emulated 1/3 texel shift used during the search (order flips when not up).
3992        StpMF2 gBi = gUp ? StpMF2(2.0 / 3.0, 1.0 / 3.0) : StpMF2(1.0 / 3.0 , 2.0 / 3.0);
            // Signed gradient towards the chosen (high-contrast) neighbor.
3993        StpMF1 gBMinusE = gUp ? gBHMinusE.x : gBHMinusE.y;
            // First search step (1 texel away from center) built from the 3x3 data instead of another fetch:
            // 1/3 of the row on the high-contrast side plus 2/3 of the center row.
3994        StpMF2 gBi0 = (gUp ? gAC : gGI) * StpMF2_(1.0 / 3.0) + gDF * StpMF2_(2.0 / 3.0);
            // First pair of texels for the directional blur.
3995        StpMF2 gLo0 = gDF;
3996        StpMF1 gAbsBMinusE = abs(gBMinusE);
            // NOTE(review): 'gNe' and 'gGood' are not referenced again anywhere in this function.
3997        StpMF1 gNe = gAbsBMinusE;
3998        StpMF1 gGood = StpGtZeroMF1(gBMinusE);
3999//------------------------------------------------------------------------------------------------------------------------------
            // One pixel walk distance along the search direction.
4000        StpF2 gWalk = gVert ? StpF2(0.0, kRcpI.y) : StpF2(kRcpI.x, 0.0);
            // One pixel step perpendicular to the search, pointing at the high-contrast neighbor (negated when up/left).
4001        StpF2 gDecon = gVert ? StpF2(kRcpI.x, 0.0) : StpF2(0.0, kRcpI.y);
4002        if(gUp) gDecon = -gDecon;
4003//------------------------------------------------------------------------------------------------------------------------------
            // Search base position, shifted 1/3 of a pixel towards the high-contrast neighbor.
4004        StpF2 gP = p + gDecon * StpF2_(1.0/3.0);
4005//------------------------------------------------------------------------------------------------------------------------------
            // Gather positions, four on the negative side (N) and four on the positive side (P), spaced 2 pixels apart.
4006        StpF2 gPN3 = gP - StpF2_(8.5) * gWalk;
4007        StpF2 gPN2 = gP - StpF2_(6.5) * gWalk;
4008        StpF2 gPN1 = gP - StpF2_(4.5) * gWalk;
4009        StpF2 gPN0 = gP - StpF2_(2.5) * gWalk;
4010        StpF2 gPP0 = gP + StpF2_(2.5) * gWalk;
4011        StpF2 gPP1 = gP + StpF2_(4.5) * gWalk;
4012        StpF2 gPP2 = gP + StpF2_(6.5) * gWalk;
4013        StpF2 gPP3 = gP + StpF2_(8.5) * gWalk;
4014//------------------------------------------------------------------------------------------------------------------------------
            // All eight gathers are issued regardless of STP_GEAA_P (the blur below reads every one of them).
4015        StpMF4 gGN3, gGN2, gGN1, gGN0, gGP0, gGP1, gGP2, gGP3;
4016        gGN3 = StpGeaa4F(gPN3);
4017        gGN2 = StpGeaa4F(gPN2);
4018        gGN1 = StpGeaa4F(gPN1);
4019        gGN0 = StpGeaa4F(gPN0);
4020        gGP0 = StpGeaa4F(gPP0);
4021        gGP1 = StpGeaa4F(gPP1);
4022        gGP2 = StpGeaa4F(gPP2);
4023        gGP3 = StpGeaa4F(gPP3);
4024//------------------------------------------------------------------------------------------------------------------------------
            // For a vertical search, transpose each gather (swap .x and .z) so the indexing below matches the horizontal case.
4025        if(gVert) {
4026            gGN3 = gGN3.zyxw;
4027            gGN2 = gGN2.zyxw;
4028            gGN1 = gGN1.zyxw;
4029            gGN0 = gGN0.zyxw;
4030            gGP0 = gGP0.zyxw;
4031            gGP1 = gGP1.zyxw;
4032            gGP2 = gGP2.zyxw;
4033            gGP3 = gGP3.zyxw; }
4034//------------------------------------------------------------------------------------------------------------------------------
            // Texels for the variable length low-pass box blur, paired as {negative side, positive side}.
            // Defaults take the .xy half of each gather, the '!gUp' case takes the .wz half instead.
4035        StpMF2 gLo8 = StpMF2(gGN3.x, gGP3.y);
4036        StpMF2 gLo7 = StpMF2(gGN3.y, gGP3.x);
4037        StpMF2 gLo6 = StpMF2(gGN2.x, gGP2.y);
4038        StpMF2 gLo5 = StpMF2(gGN2.y, gGP2.x);
4039        StpMF2 gLo4 = StpMF2(gGN1.x, gGP1.y);
4040        StpMF2 gLo3 = StpMF2(gGN1.y, gGP1.x);
4041        StpMF2 gLo2 = StpMF2(gGN0.x, gGP0.y);
4042        StpMF2 gLo1 = StpMF2(gGN0.y, gGP0.x);
4043        if(!gUp) {
4044            gLo8 = StpMF2(gGN3.w, gGP3.z);
4045            gLo7 = StpMF2(gGN3.z, gGP3.w);
4046            gLo6 = StpMF2(gGN2.w, gGP2.z);
4047            gLo5 = StpMF2(gGN2.z, gGP2.w);
4048            gLo4 = StpMF2(gGN1.w, gGP1.z);
4049            gLo3 = StpMF2(gGN1.z, gGP1.w);
4050            gLo2 = StpMF2(gGN0.w, gGP0.z);
4051            gLo1 = StpMF2(gGN0.z, gGP0.w); }
4052//------------------------------------------------------------------------------------------------------------------------------
            // Simulate the bilinear fetch: blend the two rows of each gather with the 'gBi' scalars.
4053        StpMF2 gGN3Bi = gGN3.yx * StpMF2_(gBi.x) + gGN3.zw * StpMF2_(gBi.y);
4054        StpMF2 gGN2Bi = gGN2.yx * StpMF2_(gBi.x) + gGN2.zw * StpMF2_(gBi.y);
4055        StpMF2 gGN1Bi = gGN1.yx * StpMF2_(gBi.x) + gGN1.zw * StpMF2_(gBi.y);
4056        StpMF2 gGN0Bi = gGN0.yx * StpMF2_(gBi.x) + gGN0.zw * StpMF2_(gBi.y);
4057        StpMF2 gGP0Bi = gGP0.yx * StpMF2_(gBi.x) + gGP0.zw * StpMF2_(gBi.y);
4058        StpMF2 gGP1Bi = gGP1.yx * StpMF2_(gBi.x) + gGP1.zw * StpMF2_(gBi.y);
4059        StpMF2 gGP2Bi = gGP2.yx * StpMF2_(gBi.x) + gGP2.zw * StpMF2_(gBi.y);
4060        StpMF2 gGP3Bi = gGP3.yx * StpMF2_(gBi.x) + gGP3.zw * StpMF2_(gBi.y);
            // Regroup as {negative, positive} pairs per search step. Note positive side the {x,y} order is reversed.
4061        StpMF2 gBi8 = StpMF2(gGN3Bi.y, gGP3Bi.x);
4062        StpMF2 gBi7 = StpMF2(gGN3Bi.x, gGP3Bi.y);
4063        StpMF2 gBi6 = StpMF2(gGN2Bi.y, gGP2Bi.x);
4064        StpMF2 gBi5 = StpMF2(gGN2Bi.x, gGP2Bi.y);
4065        StpMF2 gBi4 = StpMF2(gGN1Bi.y, gGP1Bi.x);
4066        StpMF2 gBi3 = StpMF2(gGN1Bi.x, gGP1Bi.y);
4067        StpMF2 gBi2 = StpMF2(gGN0Bi.y, gGP0Bi.x);
4068        StpMF2 gBi1 = StpMF2(gGN0Bi.x, gGP0Bi.y);
4069//------------------------------------------------------------------------------------------------------------------------------
            // Threshold for end of span (X), and base to compare against (Y).
4070        StpMF2 gEndBase;
            // Base is 'self * (2.0/3.0) + other * (1.0/3.0)', written as 'self + (other - self) / 3'.
4071        gEndBase.y = gBMinusE * StpMF1_(1.0/3.0) + gE;
4072        gEndBase.x = gAbsBMinusE * StpMF1_(STP_GEAA_THRESHOLD);
            // Turn the threshold into a scale factor. The disabled branch clamps before the reciprocal (the 16-bit notes
            // call it the safer version); the live branch calls StpPrxLoRcpMF1 (presumably an approximate reciprocal,
            // defined elsewhere in the file).
4073        #if 0
4074            gEndBase.x = StpRcpMF1(max(StpMF1_(1.0 / 16384.0), gEndBase.x));
4075        #else
4076            gEndBase.x = StpPrxLoRcpMF1(gEndBase.x);
4077        #endif
4078//------------------------------------------------------------------------------------------------------------------------------
            // Opacity term per search step, {0 := not done, 1 := end of span}.
            // STP_GEAA_P selects how many of the gathered steps take part in the search.
4079        #if (STP_GEAA_P > 2)
4080            StpMF2 gUseP8 = StpSatMF2(abs(gBi8 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
4081            StpMF2 gUseP7 = StpSatMF2(abs(gBi7 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
4082        #endif
4083        #if (STP_GEAA_P > 1)
4084            StpMF2 gUseP6 = StpSatMF2(abs(gBi6 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
4085            StpMF2 gUseP5 = StpSatMF2(abs(gBi5 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
4086        #endif
4087        #if (STP_GEAA_P > 0)
4088            StpMF2 gUseP4 = StpSatMF2(abs(gBi4 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
4089            StpMF2 gUseP3 = StpSatMF2(abs(gBi3 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
4090        #endif
4091            StpMF2 gUseP2 = StpSatMF2(abs(gBi2 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
4092            StpMF2 gUseP1 = StpSatMF2(abs(gBi1 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
4093            StpMF2 gUseP0 = StpSatMF2(abs(gBi0 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
4094//------------------------------------------------------------------------------------------------------------------------------
            // Distance traveled for {negative, positive} paths, resolved like painters alpha blending.
            // Start from the distance used when no end is found for the configured STP_GEAA_P.
4095        #if (STP_GEAA_P == 3)
4096            StpMF2 gDst2 = StpMF2_(9.5);
4097        #endif
4098        #if (STP_GEAA_P == 2)
4099            StpMF2 gDst2 = StpMF2_(7.5);
4100        #endif
4101        #if (STP_GEAA_P == 1)
4102            StpMF2 gDst2 = StpMF2_(5.5);
4103        #endif
4104        #if (STP_GEAA_P == 0)
4105            StpMF2 gDst2 = StpMF2_(3.5);
4106        #endif
            // Blend from farthest to nearest so the nearest detected end wins (each line is a lerp towards that step).
4107        #if (STP_GEAA_P > 2)
4108            gDst2 = gDst2 + (StpMF2_(8.5) - gDst2) * gUseP8;
4109            gDst2 = gDst2 + (StpMF2_(7.5) - gDst2) * gUseP7;
4110        #endif
4111        #if (STP_GEAA_P > 1)
4112            gDst2 = gDst2 + (StpMF2_(6.5) - gDst2) * gUseP6;
4113            gDst2 = gDst2 + (StpMF2_(5.5) - gDst2) * gUseP5;
4114        #endif
4115        #if (STP_GEAA_P > 0)
4116            gDst2 = gDst2 + (StpMF2_(4.5) - gDst2) * gUseP4;
4117            gDst2 = gDst2 + (StpMF2_(3.5) - gDst2) * gUseP3;
4118        #endif
4119            gDst2 = gDst2 + (StpMF2_(2.5) - gDst2) * gUseP2;
4120            gDst2 = gDst2 + (StpMF2_(1.5) - gDst2) * gUseP1;
4121            gDst2 = gDst2 + (StpMF2_(0.5) - gDst2) * gUseP0;
4122//------------------------------------------------------------------------------------------------------------------------------
            // Variable length low-pass box blur.
            // Blur radius: average of the two span distances, minus STP_GEAA_SUBPIX.
4123        StpMF1 gLoSub = (gDst2.x + gDst2.y) * StpMF1_(0.5) - StpMF1_(STP_GEAA_SUBPIX);
            // Per-step weights: 0 while 'gLoSub <= k-1', ramping to 1 at 'gLoSub >= k' (k is the constant on each lane).
            // Only the .x lane of 'gLoW89' is consumed below.
4124        StpMF2 gLoW01 = StpMF2_(1.0) - StpSatMF2(StpMF2(1.0, 2.0) - StpMF2_(gLoSub));
4125        StpMF2 gLoW23 = StpMF2_(1.0) - StpSatMF2(StpMF2(3.0, 4.0) - StpMF2_(gLoSub));
4126        StpMF2 gLoW45 = StpMF2_(1.0) - StpSatMF2(StpMF2(5.0, 6.0) - StpMF2_(gLoSub));
4127        StpMF2 gLoW67 = StpMF2_(1.0) - StpSatMF2(StpMF2(7.0, 8.0) - StpMF2_(gLoSub));
4128        StpMF2 gLoW89 = StpMF2_(1.0) - StpSatMF2(StpMF2(9.0,10.0) - StpMF2_(gLoSub));
            // Weighted accumulation of samples, {negative, positive} sides kept in separate lanes.
4129        StpMF2 gLoAcc2 =
4130            gLo0 * StpMF2_(gLoW01.x) +
4131            gLo1 * StpMF2_(gLoW01.y) +
4132            gLo2 * StpMF2_(gLoW23.x) +
4133            gLo3 * StpMF2_(gLoW23.y) +
4134            gLo4 * StpMF2_(gLoW45.x) +
4135            gLo5 * StpMF2_(gLoW45.y) +
4136            gLo6 * StpMF2_(gLoW67.x) +
4137            gLo7 * StpMF2_(gLoW67.y) +
4138            gLo8 * StpMF2_(gLoW89.x);
            // Add the center texel (weight 1) and fold both sides together.
4139        StpMF1 gLoAcc = gE + gLoAcc2.x + gLoAcc2.y;
            // Weight sum: 1 for the center plus each step weight twice (one texel per side), then normalize.
4140        StpMF2 gLoW2 = gLoW01 + gLoW23 + gLoW45 + gLoW67;
4141        gLoW2 *= StpMF2_(2.0);
4142        gLoAcc *= StpRcpMF1(StpMF1_(1.0) + gLoW89.x * StpMF1_(2.0) + gLoW2.x + gLoW2.y);
            // Convert the blurred value into a {0 to 1} blend between self and the high-contrast neighbor.
            // NOTE(review): no guard here for 'gBH.x == gE'; behavior then depends on StpRcpMF1/StpSatMF1 (not shown).
4143        StpMF1 gOff = StpSatMF1((gLoAcc - gE) * StpRcpMF1(gBH.x - gE));
            // It is important to not exceed 0.5 weight for PIXart scaling.
4144        gOff = min(gOff, StpMF1_(0.5));
4145//------------------------------------------------------------------------------------------------------------------------------
            // Dilation pixel: one full pixel towards the high-contrast neighbor.
4146        gDilate = p + gDecon;
            // Filter position: 'gOff' of a pixel towards the high-contrast neighbor.
4147        gFilter = p + gDecon * StpF2_(gOff);
4148        gLuma = lerp(gE, gBH.x, gOff);
4149//------------------------------------------------------------------------------------------------------------------------------
            // Weight fix-up to also restore some anti-aliased edges, borrowing from the 'E to H' side.
            // 'gAnti' is the same expression as 'gLuma' above.
4150        StpMF1 gAnti = lerp(gE, gBH.x, gOff);
            // Movement towards H solved for the maximum 'gOff' (F=0.5): T=(-2*A+B+E)/(E-H), saturated to {0 to 1}.
4151        StpMF1 gT = StpSatMF1((StpMF1_(-2.0) * gAnti + gBH.x + gE) * StpRcpMF1(gE - gBH.y));
            // 'gFix' first holds '-(E+(H-E)*T)', i.e. the negated center after moving T towards H,
            // then becomes the recomputed blend factor '(A-E')/(B-E')', saturated.
4152        StpMF1 gFix = gE * (gT - StpMF1_(1.0)) - gBH.y * gT;
4153        gFix = StpSatMF1((gFix + gAnti) * StpRcpMF1(gFix + gBH.x));
4154//------------------------------------------------------------------------------------------------------------------------------
            // Output weight for pixel art scalar: '(1/(gFix+0.5) - 1)^2' assuming StpRcpMF1 is a reciprocal,
            // which is 1 at gFix=0 and 0 at gFix=0.5, then floored at 1/255.
4155        gW = gFix;
4156        gW = StpRcpMF1(gW + StpMF1_(0.5)) - StpMF1_(1.0);
4157        gW *= gW;
4158        gW = max(gW, StpMF1_(1.0/255.0)); }
4159#endif // defined(STP_GPU) && defined(STP_GEAA) && defined(STP_32BIT)
4160////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
4161////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
4162//_____________________________________________________________.._______________________________________________________________
4163//==============================================================================================================================
4164//                                               [GEAA] PACKED 16-BIT ENTRY POINT
4165//==============================================================================================================================
4166#if defined(STP_GPU) && defined(STP_GEAA) && defined(STP_16BIT)
4167    void StpGeaaH(
4168    out StpH1 gW,      // Output weight for pixel art scalar.
4169    out StpH1 gLuma,   // Filtered luma for debug.
4170    out StpF2 gFilter, // Location to sample for standalone unscaled spatial AA.
4171    out StpF2 gDilate, // Location of highest contrast neighbor.
4172    StpF2 p,           // {0 to 1} position across screen.
4173    StpF2 kRcpI,       // 1.0 / input image size in pixels.
4174    StpF2 kHalfRcpI) { // 0.5 / input image size in pixels.
4175//------------------------------------------------------------------------------------------------------------------------------
4176        // Sample 3x3 input pattern in luma (or green).
4177        //  A B C
4178        //  D E F
4179        //  G H I
4180        // Via four gather4s, usage for the next section to try to improve operand caching.
4181        #if STP_OFFSETS
4182            StpF2 pDEBA = p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y);
4183            StpH4 gDEBA = StpGeaa4H(pDEBA);
4184            StpH4 gEFCB = StpGeaa4OH(pDEBA, StpI2(1, 0));
4185            StpH4 gGHED = StpGeaa4OH(pDEBA, StpI2(0, 1));
4186            StpH4 gHIFE = StpGeaa4OH(pDEBA, StpI2(1, 1));
4187        #else // STP_OFFSETS
4188            StpH4 gDEBA = StpGeaa4H(p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y)); // .xyzw=DEBA
4189            StpH4 gEFCB = StpGeaa4H(p + StpF2( kHalfRcpI.x, -kHalfRcpI.y)); // .yz  =FC
4190            StpH4 gGHED = StpGeaa4H(p + StpF2(-kHalfRcpI.x,  kHalfRcpI.y)); // .xy  =GH
4191            StpH4 gHIFE = StpGeaa4H(p + StpF2( kHalfRcpI.x,  kHalfRcpI.y)); // .y   =I
4192        #endif // STP_OFFSETS
4193//------------------------------------------------------------------------------------------------------------------------------
4194        // Compute {horz,vert} change terms. Complex to decide on either horizontal or vertical direction.
4195        // Trouble case for some algorithms,
4196        //  0 1 0
4197        //  0 1 0
4198        //  0 1 0
4199        // This should present as a vertical search direction.
4200        // Simple stuff like sum of each 2x2 produces,
4201        //  2 2
4202        //  2 2
4203        // Which has no direction.
4204        // {ABC,ADG}
4205        StpH2 gHV0,gHV1,gHV2;
4206        gHV0.x = gDEBA.z * StpH1_(-2.0) + gEFCB.z;
4207        gHV0.y = gDEBA.x * StpH1_(-2.0) + gGHED.x;
4208        gHV0 += StpH2_(gDEBA.w);
4209        // {DEF,BEH}
4210        gHV1.x = gDEBA.x + gEFCB.y;
4211        gHV1.y = gDEBA.z + gGHED.y;
4212        gHV1 += StpH2_(gDEBA.y) * StpH2_(-2.0);
4213        // {GHI,CFI}
4214        gHV2.x = gGHED.x + gGHED.y * StpH1_(-2.0);
4215        gHV2.y = gEFCB.z + gEFCB.y * StpH1_(-2.0);
4216        gHV2 += StpH2_(gHIFE.y);
4217        // Combine terms.
4218        #if 0
4219            // What FXAA does, better for a diagonal computation (which is not needed), left for reference.
4220            StpH2 gHV = abs(gHV0) + abs(gHV1) * StpH2_(2.0) + abs(gHV2);
4221        #else
4222            // Slightly faster for packed 16-bit (which has no free ABS on AMD).
4223            StpH2 gHV = gHV0 * gHV0 + gHV1 * gHV1 * StpH2_(2.0) + gHV2 * gHV2;
4224        #endif
4225        // Choose search direction, the 'gVert' is true:=vert, false:=horz.
4226        // Go vertical search if horizontal has higher contrast (search perpendicular).
4227        StpP1 gVert = gHV.x > gHV.y;
4228//------------------------------------------------------------------------------------------------------------------------------
4229        // This is BH if search horizontal, else DF (as BH) if search vertical.
4230        StpH2 gBH = gVert ? StpH2(gDEBA.x, gEFCB.y) : StpH2(gDEBA.z, gGHED.y);
4231        // Will need these later, will let the compiler move around the transpose.
4232        StpH2 gAC = gVert ? StpH2(gDEBA.w, gGHED.x) : StpH2(gDEBA.w, gEFCB.z);
4233        StpH2 gDF = gVert ? StpH2(gDEBA.z, gGHED.y) : StpH2(gDEBA.x, gEFCB.y);
4234        StpH2 gGI = gVert ? StpH2(gEFCB.y, gHIFE.y) : StpH2(gGHED.x, gHIFE.y);
4235        // Start to compute threshold for end of span, compute a gradient pair.
4236        StpH2 gBHMinusE = gBH - StpH2_(gDEBA.y);
4237        StpH2 gEnd2 = abs(gBHMinusE);
4238        // If gradient is larger upward (or leftward if vert).
4239        StpP1 gUp = gEnd2.x >= gEnd2.y;
4240//------------------------------------------------------------------------------------------------------------------------------
4241        // Rename.
4242        StpH1 gE = gDEBA.y;
4243        // Swap if not up. From this point on, the B is the high-contrast neighbor, and the H is the other one in same dir.
4244        gBH = gUp ? gBH : gBH.yx;
4245//------------------------------------------------------------------------------------------------------------------------------
4246        // Choose the bilinear scalar (gets to 1/3 between texels during the search).
4247        //  .x ... For texel closer to pixel axis when up (reversed when down).
4248        //  .y ... For more distant texel.
4249        // LOGIC
4250        // =====
4251        // This keeps threshold of 2 of the 3 end conditions the same (so 1/3 shift is better than 1/4).
4252        // =====
4253        //  e         e    e   <- e = end cases
4254        //  0    0    1    1   <- 1/3 of high contrast neighbor
4255        //  0    1    0    1   <- 2/3 of self
4256        // ------------------
4257        //  0   2/3  1/3   1   <- blended value (2/3 is the target)
4258        // 2/3   0   1/3  1/3  <- abs(difference to target)
4259        StpH2 gBi = gUp ? StpH2(2.0 / 3.0, 1.0 / 3.0) : StpH2(1.0 / 3.0 , 2.0 / 3.0);
4260        // Choose either {B-E, or H-E}.
4261        StpH1 gBMinusE = gUp ? gBHMinusE.x : gBHMinusE.y;
4262        // Finish Bi0, this is the first 2 texture fetches (done using math instead) at P0 (1 texel away from center).
4263        StpH2 gBi0 = (gUp ? gAC : gGI) * StpH2_(1.0 / 3.0) + gDF * StpH2_(2.0 / 3.0);
4264        // Finish Lo0, for the directional blur.
4265        StpH2 gLo0 = gDF;
4266        // Store out spatial neighborhood.
4267        StpH1 gAbsBMinusE = abs(gBMinusE);
4268        // This is just the highest contrast neighbor along the chosen direction, may report less contrast than actual.
4269        StpH1 gNe = gAbsBMinusE;
4270        // Good direction to compare against at the end.
4271        // Good means 'don't flip' to the other side.
4272        // Have 'B-E' want 'signed(E-(B/2+E/2))' = 'signed(E/2-B/2)' = 'signed(E-B)' = 'gtzero(B-E)'
4273        StpH1 gGood = StpGtZeroH1(gBMinusE);
4274//------------------------------------------------------------------------------------------------------------------------------
4275        // One pixel walk distance for search.
4276        StpF2 gWalk = gVert ? StpF2(0.0, kRcpI.y) : StpF2(kRcpI.x, 0.0);
4277        // This is the direction of decontrast (towards the highest contrast neighbor).
4278        StpF2 gDecon = gVert ? StpF2(kRcpI.x, 0.0) : StpF2(0.0, kRcpI.y);
4279        // If up (or left) work negative.
4280        if(gUp) gDecon = -gDecon;
4281//------------------------------------------------------------------------------------------------------------------------------
4282        // Have enough now to build out sampling positions.
4283        // This works in gather4 to get two samples per gather, then uses math to finish the bilinear fetch.
4284        // In case the logic ever goes back to a non-gather4 version, this keeps with the 1/3 offset.
4285        // Build base, 1/3 to neighbor pixel.
4286        // It must be 1/3 to neighbor pixel to be able to find the end of thin stuff like this.
4287        //  . . . . . . . . . . .
4288        //  . . . . . . x x x x x
4289        //  . x x x x x . . . . .
4290        //      |       |
4291        //      |------>|
4292        //              |                             .     x
4293        //            If it was 1/2 to neighbor, then x and . would look the same.
4294        StpF2 gP = p + gDecon * StpF2_(1.0/3.0);
4295        // The gather4 positions are (assuming horizontal then up).
4296        //  3 3 2 2 1 1 0 0 A B C 0 0 1 1 2 2 3 3
4297        //  3 3 2 2 1 1 0 0 D E F 0 0 1 1 2 2 3 3
4298        //                  G H I
4299//------------------------------------------------------------------------------------------------------------------------------
4300        // Sampling positions.
4301        // Currently walking without gaps, but could skip along too!
4302        StpF2 gPN3 = gP - StpF2_(8.5) * gWalk;
4303        StpF2 gPN2 = gP - StpF2_(6.5) * gWalk;
4304        StpF2 gPN1 = gP - StpF2_(4.5) * gWalk;
4305        StpF2 gPN0 = gP - StpF2_(2.5) * gWalk;
4306        StpF2 gPP0 = gP + StpF2_(2.5) * gWalk;
4307        StpF2 gPP1 = gP + StpF2_(4.5) * gWalk;
4308        StpF2 gPP2 = gP + StpF2_(6.5) * gWalk;
4309        StpF2 gPP3 = gP + StpF2_(8.5) * gWalk;
4310//------------------------------------------------------------------------------------------------------------------------------
4311        // This attempts to do sampling in a cache friendly way.
4312        // Cannot sample with offsets, because it could be vertical or horizontal and offsets need to be static in DX.
4313        // Sampling pairs {negative, positive} directions.
4314        StpH4 gGN3, gGN2, gGN1, gGN0, gGP0, gGP1, gGP2, gGP3;
4315        gGN3 = StpGeaa4H(gPN3);
4316        gGN2 = StpGeaa4H(gPN2);
4317        gGN1 = StpGeaa4H(gPN1);
4318        gGN0 = StpGeaa4H(gPN0);
4319        gGP0 = StpGeaa4H(gPP0);
4320        gGP1 = StpGeaa4H(gPP1);
4321        gGP2 = StpGeaa4H(gPP2);
4322        gGP3 = StpGeaa4H(gPP3);
4323//------------------------------------------------------------------------------------------------------------------------------
4324        // Finish the bilinear fetch.
4325        // For 'vertical' this needs to do a transpose.
4326        // The FMAs are duplicated, else the compiler would need to do that anyway.
4327        //                             1st 2nd for N side (P side is reversed)
4328        //  -----------                  | |
4329        //  W Z     w z  !vert &  up ... Y X, Z W
4330        //  X Y [p] x y
4331        //  -----------
4332        //  W Z [p] w z  !vert & !up ... Z W, Y X
4333        //  X Y     x y
4334        //  -----------
4335        //  W Z           vert &  up ... Y Z, X W
4336        //  X Y
4337        //   [p]
4338        //  w z
4339        //  x y
4340        //  -----------
4341        //    W Z         vert & !up ... X W, Y Z
4342        //    X Y                        | |  | |
4343        //   [p]                         | |  0.33 term
4344        //    w z                        | |
4345        //    x y                        0.66 term
4346        //  -----------
4347        if(gVert) {
4348            gGN3 = gGN3.zyxw;
4349            gGN2 = gGN2.zyxw;
4350            gGN1 = gGN1.zyxw;
4351            gGN0 = gGN0.zyxw;
4352            gGP0 = gGP0.zyxw;
4353            gGP1 = gGP1.zyxw;
4354            gGP2 = gGP2.zyxw;
4355            gGP3 = gGP3.zyxw; }
4356//------------------------------------------------------------------------------------------------------------------------------
4357        // Grab the texels for the variable length inline low-pass box blur.
4358        StpH2 gLo8 = StpH2(gGN3.x, gGP3.y);
4359        StpH2 gLo7 = StpH2(gGN3.y, gGP3.x);
4360        StpH2 gLo6 = StpH2(gGN2.x, gGP2.y);
4361        StpH2 gLo5 = StpH2(gGN2.y, gGP2.x);
4362        StpH2 gLo4 = StpH2(gGN1.x, gGP1.y);
4363        StpH2 gLo3 = StpH2(gGN1.y, gGP1.x);
4364        StpH2 gLo2 = StpH2(gGN0.x, gGP0.y);
4365        StpH2 gLo1 = StpH2(gGN0.y, gGP0.x);
4366        if(!gUp) {
4367            gLo8 = StpH2(gGN3.w, gGP3.z);
4368            gLo7 = StpH2(gGN3.z, gGP3.w);
4369            gLo6 = StpH2(gGN2.w, gGP2.z);
4370            gLo5 = StpH2(gGN2.z, gGP2.w);
4371            gLo4 = StpH2(gGN1.w, gGP1.z);
4372            gLo3 = StpH2(gGN1.z, gGP1.w);
4373            gLo2 = StpH2(gGN0.w, gGP0.z);
4374            gLo1 = StpH2(gGN0.z, gGP0.w); }
4375//------------------------------------------------------------------------------------------------------------------------------
4376        // Simulate the bilinear fetch.
4377        StpH2 gGN3Bi = gGN3.yx * StpH2_(gBi.x) + gGN3.zw * StpH2_(gBi.y);
4378        StpH2 gGN2Bi = gGN2.yx * StpH2_(gBi.x) + gGN2.zw * StpH2_(gBi.y);
4379        StpH2 gGN1Bi = gGN1.yx * StpH2_(gBi.x) + gGN1.zw * StpH2_(gBi.y);
4380        StpH2 gGN0Bi = gGN0.yx * StpH2_(gBi.x) + gGN0.zw * StpH2_(gBi.y);
4381        StpH2 gGP0Bi = gGP0.yx * StpH2_(gBi.x) + gGP0.zw * StpH2_(gBi.y);
4382        StpH2 gGP1Bi = gGP1.yx * StpH2_(gBi.x) + gGP1.zw * StpH2_(gBi.y);
4383        StpH2 gGP2Bi = gGP2.yx * StpH2_(gBi.x) + gGP2.zw * StpH2_(gBi.y);
4384        StpH2 gGP3Bi = gGP3.yx * StpH2_(gBi.x) + gGP3.zw * StpH2_(gBi.y);
4385        // Note positive side the {x,y} order is reversed.
4386        StpH2 gBi8 = StpH2(gGN3Bi.y, gGP3Bi.x);
4387        StpH2 gBi7 = StpH2(gGN3Bi.x, gGP3Bi.y);
4388        StpH2 gBi6 = StpH2(gGN2Bi.y, gGP2Bi.x);
4389        StpH2 gBi5 = StpH2(gGN2Bi.x, gGP2Bi.y);
4390        StpH2 gBi4 = StpH2(gGN1Bi.y, gGP1Bi.x);
4391        StpH2 gBi3 = StpH2(gGN1Bi.x, gGP1Bi.y);
4392        StpH2 gBi2 = StpH2(gGN0Bi.y, gGP0Bi.x);
4393        StpH2 gBi1 = StpH2(gGN0Bi.x, gGP0Bi.y);
4394//------------------------------------------------------------------------------------------------------------------------------
4395        // Threshold for end of span (X), and base to compare against (Y).
4396        StpH2 gEndBase;
4397        // For a (1.0/3.0) pixel shift.
4398        // The 'gBMinusE = other - self', and want 'self * (2.0/3.0) + other * (1.0/3.0)'.
4399        gEndBase.y = gBMinusE * StpH1_(1.0/3.0) + gE;
4400        gEndBase.x = gAbsBMinusE * StpH1_(STP_GEAA_THRESHOLD);
4401        // Safer version here for reference.
4402        #if 0
4403            gEndBase.x = StpRcpH1(max(StpH1_(1.0 / 16384.0), gEndBase.x));
4404        #else
4405            gEndBase.x = StpPrxLoRcpH1(gEndBase.x);
4406        #endif
4407//------------------------------------------------------------------------------------------------------------------------------
4408        // Compute opacity term, {0 := not done, 1 := end of span}.
4409        #if (STP_GEAA_P > 2)
4410            StpH2 gUseP8 = StpSatH2(abs(gBi8 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
4411            StpH2 gUseP7 = StpSatH2(abs(gBi7 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
4412        #endif
4413        #if (STP_GEAA_P > 1)
4414            StpH2 gUseP6 = StpSatH2(abs(gBi6 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
4415            StpH2 gUseP5 = StpSatH2(abs(gBi5 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
4416        #endif
4417        #if (STP_GEAA_P > 0)
4418            StpH2 gUseP4 = StpSatH2(abs(gBi4 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
4419            StpH2 gUseP3 = StpSatH2(abs(gBi3 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
4420        #endif
4421            StpH2 gUseP2 = StpSatH2(abs(gBi2 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
4422            StpH2 gUseP1 = StpSatH2(abs(gBi1 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
4423            StpH2 gUseP0 = StpSatH2(abs(gBi0 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
4424//------------------------------------------------------------------------------------------------------------------------------
4425        // Work this like painters alpha blending.
4426        // This analog path is faster and cleaner than binary logic.
4427        // Distance traveled for {negative, positive} paths.
4428        // LOGIC
4429        // =====
4430        // Note distance factors already have the 0.5 factored in.
4431        //  N := negative search end (1 pixel away, but edge is 0.5 pixel away)
4432        //  P := positive search end (4 pixel away, but edge is 3.5 pixel away)
4433        //  X := the pixel to filter
4434        //               :<->:<------------->:
4435        //               :   :               :
4436        //               :   :             +---+---+---+---+
4437        //               :   :             | : |   |   |   |
4438        //               N +---+---+---+---+-P-+---+---+---+
4439        //                 | X |   |   |   |   |   |   |   |
4440        // +---+---+---+---+---+---+---+---+---+---+---+---+
4441        // |   |   |   |   |   |   |   |   |   |   |   |   |
4442        // +---+---+---+---+---+---+---+---+---+---+---+---+
4443        #if (STP_GEAA_P == 3)
4444            StpH2 gDst2 = StpH2_(9.5);
4445        #endif
4446        #if (STP_GEAA_P == 2)
4447            StpH2 gDst2 = StpH2_(7.5);
4448        #endif
4449        #if (STP_GEAA_P == 1)
4450            StpH2 gDst2 = StpH2_(5.5);
4451        #endif
4452        #if (STP_GEAA_P == 0)
4453            StpH2 gDst2 = StpH2_(3.5);
4454        #endif
4455        #if (STP_GEAA_P > 2)
4456            gDst2 = gDst2 + (StpH2_(8.5) - gDst2) * gUseP8;
4457            gDst2 = gDst2 + (StpH2_(7.5) - gDst2) * gUseP7;
4458        #endif
4459        #if (STP_GEAA_P > 1)
4460            gDst2 = gDst2 + (StpH2_(6.5) - gDst2) * gUseP6;
4461            gDst2 = gDst2 + (StpH2_(5.5) - gDst2) * gUseP5;
4462        #endif
4463        #if (STP_GEAA_P > 0)
4464            gDst2 = gDst2 + (StpH2_(4.5) - gDst2) * gUseP4;
4465            gDst2 = gDst2 + (StpH2_(3.5) - gDst2) * gUseP3;
4466        #endif
4467            gDst2 = gDst2 + (StpH2_(2.5) - gDst2) * gUseP2;
4468            gDst2 = gDst2 + (StpH2_(1.5) - gDst2) * gUseP1;
4469            gDst2 = gDst2 + (StpH2_(0.5) - gDst2) * gUseP0;
4470//------------------------------------------------------------------------------------------------------------------------------
4471        // Run the variable length low-pass box blur.
4472        // Need half distance with half pixel removed.
4473        StpH1 gLoSub = (gDst2.x + gDst2.y) * StpH1_(0.5) - StpH1_(STP_GEAA_SUBPIX);
4474        // compute the weights (if should be included or not).
4475        StpH2 gLoW01 = StpH2_(1.0) - StpSatH2(StpH2(1.0, 2.0) - StpH2_(gLoSub));
4476        StpH2 gLoW23 = StpH2_(1.0) - StpSatH2(StpH2(3.0, 4.0) - StpH2_(gLoSub));
4477        StpH2 gLoW45 = StpH2_(1.0) - StpSatH2(StpH2(5.0, 6.0) - StpH2_(gLoSub));
4478        StpH2 gLoW67 = StpH2_(1.0) - StpSatH2(StpH2(7.0, 8.0) - StpH2_(gLoSub));
4479        StpH2 gLoW89 = StpH2_(1.0) - StpSatH2(StpH2(9.0,10.0) - StpH2_(gLoSub));
4480        // Weighted accumulation of samples.
4481        StpH2 gLoAcc2 =
4482            gLo0 * StpH2_(gLoW01.x) +
4483            gLo1 * StpH2_(gLoW01.y) +
4484            gLo2 * StpH2_(gLoW23.x) +
4485            gLo3 * StpH2_(gLoW23.y) +
4486            gLo4 * StpH2_(gLoW45.x) +
4487            gLo5 * StpH2_(gLoW45.y) +
4488            gLo6 * StpH2_(gLoW67.x) +
4489            gLo7 * StpH2_(gLoW67.y) +
4490            gLo8 * StpH2_(gLoW89.x);
4491        StpH1 gLoAcc = gE + gLoAcc2.x + gLoAcc2.y;
4492        // Weight sum.
4493        StpH2 gLoW2 = gLoW01 + gLoW23 + gLoW45 + gLoW67;
4494        gLoW2 *= StpH2_(2.0);
4495        gLoAcc *= StpRcpH1(StpH1_(1.0) + gLoW89.x * StpH1_(2.0) + gLoW2.x + gLoW2.y);
4496        // Convert to blend between self and high-contrast neighbor.
4497        // This currently allows full {0.0 to 1.0} blend.
4498        StpH1 gOff = StpSatH1((gLoAcc - gE) * StpRcpH1(gBH.x - gE));
4499        // It is important to not exceed 0.5 weight for PIXart scaling.
4500        gOff = min(gOff, StpH1_(0.5));
4501//------------------------------------------------------------------------------------------------------------------------------
4502        // Save out dilation pixel for {z,motion}.
4503        gDilate = p + gDecon;
4504        // Save out filter position.
4505        gFilter = p + gDecon * StpF2_(gOff);
4506        gLuma = lerp(gE, gBH.x, gOff);
4507//------------------------------------------------------------------------------------------------------------------------------
4508        // GEAA up to this point creates weights that only help a scalar for aliased edges.
4509        // This attempts to increase weight to also restore some anti-aliased edges.
4510        // It does this by increasing weight as much as can be borrowed from the 'E to H' side.
4511        // An equation for movement towards H,
4512        //   E+(H-E)*T  ...  Where T must be {0 to 1} ranged, but want {0 to 0.5} ranged (same as 'gOff').
4513        // Equation for E motion with respect to the B side,
4514        //   A=E+(B-E)*F  ...  Where A is the anti-aliased output, and F would typically be 'gOff'.
4515        // Solving that for E,
4516        //   E=(A-F*B)/(1-F)
4517        // Combining equations,
4518        //   E+(H-E)*T = (A-F*B)/(1-F)
4519        // Then solving for T when 'F=0.5' (maximum 'gOff' weight),
4520        //   T=(-2*A+B+E)/(E-H)
4521        // Then limit T inside {0 to 0.5}.
4522        // And use limited 'T' to recompute a new 'F' which becomes the 'gOff' fixed weight.
4523        StpH1 gAnti = lerp(gE, gBH.x, gOff);
4524        // Solve for the movement towards 'H'.
4525        // This in theory should be limited to {0 to 0.5}, but {0 to 1} seems to work too.
4526        StpH1 gT = StpSatH1((StpH1_(-2.0) * gAnti + gBH.x + gE) * StpRcpH1(gE - gBH.y));
4527        StpH1 gFix = gE * (gT - StpH1_(1.0)) - gBH.y * gT;
4528        gFix = StpSatH1((gFix + gAnti) * StpRcpH1(gFix + gBH.x));
4529//------------------------------------------------------------------------------------------------------------------------------
4530        // Output weight for pixel art scalar.
4531        // The 'gOff'set goes between {0 := no change, to 0.5 := half to neighbor}.
4532        // The half to neighbor position would be where the edge crosses between two pixels.
4533        // The sample size needs to be {0 := at the crossing, to 1 := no change}.
4534        // Can solve this, the 1D kernel will look like,
4535        //  u = (1-x)*s ... weighting terms
4536        //  v =    x *t
4537        //  w = 1/(u+v)
4538        //  o = a*u*w + b*v*w
4539        // The split is where weights are the same,
4540        //  u*w == v*w ... ((1-x)*s)/(((1-x)*s)+(x*t)) == (x*t)/(((1-x)*s)+(x*t))
4541        // Can assume s=1.0 (the other sample), thus this reduces to,
4542        //  u*w == v*w ... (1-x)/((1-x)+(x*t)) == (x*t)/((1-x)+(x*t))
4543        // Then solve for 't' given crossing point 'x'.
4544        //  t=1/x-1
4545        // Convert to 'x=gOffset+1/2'.
4546        // Solve for 't=1/x-1', or 't=1/(gOffset+1/2)-1'.
4547        gW = gFix;
4548        gW = StpRcpH1(gW + StpH1_(0.5)) - StpH1_(1.0);
4549        // Send squared (as needed by scalar).
4550        gW *= gW;
4551        // Make sure not zero.
4552        gW = max(gW, StpH1_(1.0/255.0)); }
4553#endif // defined(STP_GPU) && defined(STP_GEAA) && defined(STP_16BIT)
4554////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
4555////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
4556////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
4557////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
4558#endif // STP_UNITY_INCLUDE_GUARD
Configure Feed

Configure Feed