A game about forced loneliness, made by TACStudios
fork

Configure Feed

Select the types of activity you want to include in your feed.

at master 4558 lines 273 kB view raw
// This is necessary to prevent Unity from deciding that our default config logic is actually an include guard declaration
#ifndef STP_UNITY_INCLUDE_GUARD
#define STP_UNITY_INCLUDE_GUARD
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//
//
//                                             SPATIAL TEMPORAL POST [STP] v1.0
//
//
//==============================================================================================================================
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
// C/C++/GLSL/HLSL PORTABILITY BASED ON AMD's 'ffx_a.h'.
// INCLUDING ASSOCIATED LICENSE BELOW
//------------------------------------------------------------------------------------------------------------------------------
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//==============================================================================================================================
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
// NOTES
//------------------------------------------------------------------------------------------------------------------------------
// PLATFORM SPECIFIC WORKAROUNDS
// =============================
// - These all default to not enabled {0}, define to {1} to enable.
// - define STP_BUG_ALIAS16 1 .... Define to enable workaround for asuint16()/asfloat16().
// - define STP_BUG_PRX 1 ........ Define to disable approximate transcendentals.
// - define STP_BUG_SAT_INF 1 .... Define to workaround platforms with broken 16-bit saturate +/- INF.
// - define STP_BUG_SAT 1 ........ Define to workaround compiler incorrectly factoring out inner saturate in 16-bit code.
//------------------------------------------------------------------------------------------------------------------------------
// CONFIGURATIONS
// ==============
// - INDEPENDENT OPTIONS
// - define STP_32BIT {0 := disable, 1 := compile the 32-bit version or implicit precision version}
// - define STP_MEDIUM {0 := disable, 1 := enable the implicit medium precision version for 32-bit}
// - define STP_16BIT {0 := disable, 1 := compile the explicit 16-bit version}
// -----
// - define STP_GPU {to include shader code}
// - define STP_GLSL {to include the GLSL version of the code}
// - define STP_HLSL {to include the HLSL version of the code}
// -----
// - define STP_DIL {to include the StpDil<H,F>() entry points}
// - define STP_PAT {to include the StpPat<H,F>() entry points}
// - define STP_SAA {to include the StpSaa<H,F>() entry points}
// - define STP_TAA {to include the StpTaa<H,F>() entry points}
// -----
// - define STP_POSTMAP {running STP, 0 := before, 1 := after, application tonemapping}
//------------------------------------------------------------------------------------------------------------------------------
// IMPORTANT
// =========
// - All callbacks should explicitly sample from MIP level 0.
// - Meaning if used in a pixel shader do not allow implicit LOD calculation.
// - The algorithm is tuned for pre-tonemap operation, post-tonemap wasn't tested yet.
//==============================================================================================================================
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
// EXTERNAL OPTIONS
//==============================================================================================================================
// Each option below keeps a value supplied by the including code and otherwise falls back to the default shown here.
//------------------------------------------------------------------------------------------------------------------------------
// Debug functionality: {1} enables it, the default {0} disables it.
#if !defined(STP_BUG)
    #define STP_BUG 0
#endif
//------------------------------------------------------------------------------------------------------------------------------
// Set to {1} to test a pass-through dummy shader that fetches all resources but does no logic.
#if !defined(STP_BUG_BW_SOL)
    #define STP_BUG_BW_SOL 0
#endif
//------------------------------------------------------------------------------------------------------------------------------
// Set to {1} to use the max/min sampling permutation for color values.
#if !defined(STP_MAX_MIN_10BIT)
    #define STP_MAX_MIN_10BIT 0
#endif
//------------------------------------------------------------------------------------------------------------------------------
// Set to {1} to use the max/min sampling permutation for UINT32 values.
#if !defined(STP_MAX_MIN_UINT)
    #define STP_MAX_MIN_UINT 0
#endif
//------------------------------------------------------------------------------------------------------------------------------
// Set to {1} to use sampling with offsets.
#if !defined(STP_OFFSETS)
    #define STP_OFFSETS 0
#endif
//------------------------------------------------------------------------------------------------------------------------------
// STP is currently only tested to run pre-tonemap, as that is what Unity is using.
// Run 0 := pre-tonemap, 1 := post-tonemap.
#if !defined(STP_POSTMAP)
    #define STP_POSTMAP 0
#endif
//------------------------------------------------------------------------------------------------------------------------------
// STP TAA quality level {0 to 1}.
#if !defined(STP_TAA_Q)
    #define STP_TAA_Q 1
#endif
//==============================================================================================================================
// PLATFORM SPECIFIC BUG WORKAROUNDS
// =================================
// Set to {1} to disable usage of transcendental approximations using float/int aliasing.
#if !defined(STP_BUG_PRX)
    #define STP_BUG_PRX 0
#endif
//------------------------------------------------------------------------------------------------------------------------------
// Set to {1} for a workaround if the platform cannot use saturate of +/- INF correctly.
#if !defined(STP_BUG_SAT_INF)
    #define STP_BUG_SAT_INF 0
#endif
//------------------------------------------------------------------------------------------------------------------------------
// Set to {1} for a workaround for the compiler incorrectly factoring out inner saturate in 16-bit code.
#if !defined(STP_BUG_SAT)
    #define STP_BUG_SAT 0
#endif
//------------------------------------------------------------------------------------------------------------------------------
// Define to {1} for workarounds for broken asuint16()/asfloat16().
140#ifndef STP_BUG_ALIAS16 141 #define STP_BUG_ALIAS16 0 142 #undef STP_BUG_PRX 143 #define STP_BUG_PRX 1 144#endif 145//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 146//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 147//_____________________________________________________________.._______________________________________________________________ 148//============================================================================================================================== 149// C/C++/GLSL/HLSL PORTABILITY 150//============================================================================================================================== 151#if defined(STP_CPU) 152 #ifndef STP_RESTRICT 153 #define STP_RESTRICT __restrict 154 #endif 155//------------------------------------------------------------------------------------------------------------------------------ 156 #ifndef STP_STATIC 157 #define STP_STATIC static 158 #endif 159//------------------------------------------------------------------------------------------------------------------------------ 160 typedef unsigned char StpB1; 161 typedef unsigned short StpW1; 162 typedef float StpF1; 163 typedef uint32_t StpU1; 164 #define StpF1_(a) ((StpF1)(a)) 165 #define StpU1_(a) ((StpU1)(a)) 166 STP_STATIC StpU1 StpU1_F1(StpF1 a) { union { StpF1 f; StpU1 u; } bits; bits.f = a; return bits.u; } 167 #define StpOutF2 StpF1 *STP_RESTRICT 168 #define StpExp2F1(x) exp2f(x) 169 STP_STATIC StpF1 StpMaxF1(StpF1 a, StpF1 b) { return a > b ? a : b; } 170//------------------------------------------------------------------------------------------------------------------------------ 171 // Convert float to half (in lower 16-bits of output). 172 // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf 173 // Supports denormals. 
174 // Conversion rules are to make computations possibly "safer" on the GPU, 175 // -INF & -NaN -> -65504 176 // +INF & +NaN -> +65504 177 STP_STATIC StpU1 StpU1_H1_F1(StpF1 f) { 178 static StpW1 base[512] = { 179 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 180 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 181 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 182 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 183 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 184 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, 185 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, 186 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, 187 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, 188 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 189 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 190 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 191 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 192 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 193 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 194 
0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, 195 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 196 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 197 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 198 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 199 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 200 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, 201 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, 202 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, 203 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, 204 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 205 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 206 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 207 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 208 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 209 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, 210 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff }; 211 static 
StpB1 shift[512] = { 212 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 213 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 214 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 215 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 216 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 217 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 218 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, 219 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, 220 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, 221 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 222 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 223 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 224 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 225 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 226 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 227 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 228 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 229 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 230 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 231 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 232 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 233 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 234 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, 235 
0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, 236 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, 237 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 238 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 239 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 240 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 241 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 242 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, 243 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18 }; 244 union { StpF1 f; StpU1 u; } bits; 245 bits.f = f; StpU1 u = bits.u; StpU1 i = u >> 23; 246 return (StpU1)(base[i]) + ((u & 0x7fffff) >> shift[i]); } 247//------------------------------------------------------------------------------------------------------------------------------ 248 STP_STATIC StpU1 StpU1_H2_F2(StpInF2 a) { return StpU1_H1_F1(a[0]) + (StpU1_H1_F1(a[1]) << 16); } 249#endif // defined(STP_CPU) 250//============================================================================================================================== 251#if defined(STP_GPU) && defined(STP_GLSL) 252 #define StpP1 bool 253 #define StpP2 bvec2 254//------------------------------------------------------------------------------------------------------------------------------ 255 #define StpF1 float 256 #define StpF2 vec2 257 #define StpF3 vec3 258 #define StpF4 vec4 259//------------------------------------------------------------------------------------------------------------------------------ 260 #define StpI2 ivec2 261//------------------------------------------------------------------------------------------------------------------------------ 262 #define StpU1 uint 263 #define StpU2 uvec2 264 #define StpU3 uvec3 265 #define 
StpU4 uvec4 266//------------------------------------------------------------------------------------------------------------------------------ 267 #define StpF1_U1(x) uintBitsToFloat(StpU1(x)) 268 #define StpF2_U2(x) uintBitsToFloat(StpU2(x)) 269 #define StpF3_U3(x) uintBitsToFloat(StpU3(x)) 270 #define StpF4_U4(x) uintBitsToFloat(StpU4(x)) 271 #define StpU1_F1(x) floatBitsToUint(StpF1(x)) 272 #define StpU2_F2(x) floatBitsToUint(StpF2(x)) 273 #define StpU3_F3(x) floatBitsToUint(StpF3(x)) 274 #define StpU4_F4(x) floatBitsToUint(StpF4(x)) 275//------------------------------------------------------------------------------------------------------------------------------ 276 #define StpU1_H2_F2 packHalf2x16 277 #define StpF2_H2_U1 unpackHalf2x16 278//------------------------------------------------------------------------------------------------------------------------------ 279 StpU1 StpBfeU1(StpU1 src, StpU1 off, StpU1 bits) { return bitfieldExtract(src, int(off), int(bits)); } 280 // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate. 
// GLSL: replace the low 'bits' bits of 'src' with the low 'bits' bits of 'ins' (insert at bit offset 0).
    StpU1 StpBfiMskU1(StpU1 src, StpU1 ins, StpU1 bits) {
        return bitfieldInsert(src, ins, 0, int(bits));
    }
#endif // defined(STP_GPU) && defined(STP_GLSL)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
    // Explicit 16-bit float types.
    #define StpH1 float16_t
    #define StpH2 f16vec2
    #define StpH3 f16vec3
    #define StpH4 f16vec4
//------------------------------------------------------------------------------------------------------------------------------
    // Explicit 16-bit unsigned integer types.
    #define StpW1 uint16_t
    #define StpW2 u16vec2
    #define StpW3 u16vec3
    #define StpW4 u16vec4
//------------------------------------------------------------------------------------------------------------------------------
    // Split one 32-bit word into two 16-bit values.
    #define StpW2_U1(x) unpackUint2x16(StpU1(x))
    #define StpH2_U1(x) unpackFloat2x16(StpU1(x))
//------------------------------------------------------------------------------------------------------------------------------
    // Half -> 16-bit uint bit-pattern reinterpretation.
    #define StpW1_H1(x) halfBitsToUint16(StpH1(x))
    #define StpW2_H2(x) halfBitsToUint16(StpH2(x))
    #define StpW3_H3(x) halfBitsToUint16(StpH3(x))
    #define StpW4_H4(x) halfBitsToUint16(StpH4(x))
//------------------------------------------------------------------------------------------------------------------------------
    // 16-bit uint -> half bit-pattern reinterpretation.
    #define StpH1_W1(x) uint16BitsToHalf(StpW1(x))
    #define StpH2_W2(x) uint16BitsToHalf(StpW2(x))
    #define StpH3_W3(x) uint16BitsToHalf(StpW3(x))
    #define StpH4_W4(x) uint16BitsToHalf(StpW4(x))
//------------------------------------------------------------------------------------------------------------------------------
    // Two halves packed into one 32-bit word.
    #define StpU1_H2(x) packFloat2x16(StpH2(x))
#endif // defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_HLSL)
    // Predicate (boolean) types.
    #define StpP1 bool
    #define StpP2 bool2
//------------------------------------------------------------------------------------------------------------------------------
    // 32-bit float types.
    #define StpF1 float
    #define StpF2 float2
    #define StpF3 float3
    #define StpF4 float4
//------------------------------------------------------------------------------------------------------------------------------
    #define StpI2 int2
//------------------------------------------------------------------------------------------------------------------------------
    // 32-bit unsigned integer types.
    #define StpU1 uint
    #define StpU2 uint2
    #define StpU3 uint3
    #define StpU4 uint4
//------------------------------------------------------------------------------------------------------------------------------
    // Bit-pattern reinterpretation between float and uint (no numeric conversion).
    #define StpF1_U1(x) asfloat(StpU1(x))
    #define StpF2_U2(x) asfloat(StpU2(x))
    #define StpF3_U3(x) asfloat(StpU3(x))
    #define StpF4_U4(x) asfloat(StpU4(x))
    #define StpU1_F1(x) asuint(StpF1(x))
    #define StpU2_F2(x) asuint(StpF2(x))
    #define StpU3_F3(x) asuint(StpF3(x))
    #define StpU4_F4(x) asuint(StpF4(x))
//------------------------------------------------------------------------------------------------------------------------------
    // Pack two floats as halves into one uint: 'a.x' in the low 16 bits, 'a.y' in the high 16 bits.
    StpU1 StpU1_H2_F2_x(StpF2 a) {
        StpU1 lowHalf = f32tof16(a.x);
        StpU1 highHalf = f32tof16(a.y);
        return lowHalf | (highHalf << 16);
    }
    #define StpU1_H2_F2(a) StpU1_H2_F2_x(StpF2(a))
//------------------------------------------------------------------------------------------------------------------------------
    // Unpack two halves from one uint: low 16 bits -> '.x', high 16 bits -> '.y'.
    StpF2 StpF2_H2_U1_x(StpU1 x) {
        StpF1 fromLow = f16tof32(x & 0xFFFF);
        StpF1 fromHigh = f16tof32(x >> 16);
        return StpF2(fromLow, fromHigh);
    }
    #define StpF2_H2_U1(x) StpF2_H2_U1_x(StpU1(x))
//------------------------------------------------------------------------------------------------------------------------------
    // Extract 'bits' bits of 'src' starting at bit 'off'.
    StpU1 StpBfeU1(StpU1 src, StpU1 off, StpU1 bits) {
        StpU1 lowMask = (1u << bits) - 1;
        return (src >> off) & lowMask;
    }
    // Replace the low 'bits' bits of 'src' with the low 'bits' bits of 'ins'.
    StpU1 StpBfiMskU1(StpU1 src, StpU1 ins, StpU1 bits) {
        StpU1 lowMask = (1u << bits) - 1;
        StpU1 inserted = ins & lowMask;
        StpU1 preserved = src & (~lowMask);
        return inserted | preserved;
    }
#endif // defined(STP_GPU) && defined(STP_HLSL)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_MEDIUM)
    // Medium precision maps to the HLSL minimum-precision types.
    #define StpMU1 min16uint
    #define StpMU2 min16uint2
    #define StpMU3 min16uint3
    #define StpMU4 min16uint4
//------------------------------------------------------------------------------------------------------------------------------
    #define StpMF1 min16float
    #define StpMF2 min16float2
    #define StpMF3 min16float3
    #define StpMF4 min16float4
#endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_MEDIUM)
//==============================================================================================================================
#if defined(STP_GPU) && (!defined(STP_MEDIUM))
    // Without STP_MEDIUM the medium-precision names alias the full 32-bit types.
    #define StpMU1 StpU1
    #define StpMU2 StpU2
    #define StpMU3 StpU3
    #define StpMU4 StpU4
//------------------------------------------------------------------------------------------------------------------------------
    #define StpMF1 StpF1
    #define StpMF2 StpF2
    #define StpMF3 StpF3
    #define StpMF4 StpF4
#endif // defined(STP_GPU) && (!defined(STP_MEDIUM))
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
    // Explicit 16-bit float types.
    #define StpH1 float16_t
    #define StpH2 float16_t2
    #define StpH3 float16_t3
    #define StpH4 float16_t4
//------------------------------------------------------------------------------------------------------------------------------
    // Explicit 16-bit unsigned integer types.
    #define StpW1 uint16_t
    #define StpW2 uint16_t2
    #define StpW3 uint16_t3
    #define StpW4 uint16_t4
//------------------------------------------------------------------------------------------------------------------------------
    // Split one 32-bit word into two 16-bit uints: low 16 bits first, high 16 bits second.
    StpW2 StpW2_U1_x(StpU1 x) {
        StpU2 halves = StpU2(x & 0xFFFF, x >> 16);
        return StpW2(halves);
    }
    #define StpW2_U1(x) StpW2_U1_x(StpU1(x))
    // Reinterpret the two 16-bit halves of one 32-bit word as two half floats (low 16 bits first).
    StpH2 StpH2_U1_x(StpU1 x) {
        StpW1 lowWord = (StpW1)(x & 0xFFFF);
        StpW1 highWord = (StpW1)(x >> 16);
        return asfloat16(StpW2(lowWord, highWord));
    }
    #define StpH2_U1(x) StpH2_U1_x(StpU1(x))
//------------------------------------------------------------------------------------------------------------------------------
    // Half -> 16-bit uint bit-pattern reinterpretation.
    #define StpW1_H1(x) asuint16(StpH1(x))
    #define StpW2_H2(x) asuint16(StpH2(x))
    #define StpW3_H3(x) asuint16(StpH3(x))
    #define StpW4_H4(x) asuint16(StpH4(x))
//------------------------------------------------------------------------------------------------------------------------------
    // 16-bit uint -> half bit-pattern reinterpretation.
    #define StpH1_W1(x) asfloat16(StpW1(x))
    #define StpH2_W2(x) asfloat16(StpW2(x))
    #define StpH3_W3(x) asfloat16(StpW3(x))
    #define StpH4_W4(x) asfloat16(StpW4(x))
//------------------------------------------------------------------------------------------------------------------------------
    // Pack the raw bits of two halves into one 32-bit word: '.x' in the low 16 bits, '.y' in the high 16 bits.
    StpU1 StpU1_H2_x(StpH2 x) {
        StpW2 raw = asuint16(x);
        StpU1 lowBits = (StpU1)raw.x;
        StpU1 highBits = (StpU1)raw.y;
        return (lowBits | (highBits << 16));
    }
    #define StpU1_H2(x) StpU1_H2_x(StpH2(x))
#endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
//==============================================================================================================================
#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
    StpF1 StpMaxF1(StpF1 a, StpF1 b) { return max(a, b); }
//------------------------------------------------------------------------------------------------------------------------------
    // Splat constructors: each 'StpXn_(x)' macro converts 'x' to the scalar type, then replicates it across the vector.
    StpP2 StpP2_x(StpP1 x) { return StpP2(x, x); }
    #define StpP2_(x) StpP2_x(StpP1(x))
//------------------------------------------------------------------------------------------------------------------------------
    // 32-bit float.
    StpF1 StpF1_x(StpF1 x) { return StpF1(x); }
    #define StpF1_(x) StpF1_x(StpF1(x))
    StpF2 StpF2_x(StpF1 x) { return StpF2(x, x); }
    #define StpF2_(x) StpF2_x(StpF1(x))
    StpF3 StpF3_x(StpF1 x) { return StpF3(x, x, x); }
    #define StpF3_(x) StpF3_x(StpF1(x))
    StpF4 StpF4_x(StpF1 x) { return StpF4(x, x, x, x); }
    #define StpF4_(x) StpF4_x(StpF1(x))
//------------------------------------------------------------------------------------------------------------------------------
    // Medium precision float.
    StpMF1 StpMF1_x(StpMF1 x) { return StpMF1(x); }
    #define StpMF1_(x) StpMF1_x(StpMF1(x))
    StpMF2 StpMF2_x(StpMF1 x) { return StpMF2(x, x); }
    #define StpMF2_(x) StpMF2_x(StpMF1(x))
    StpMF3 StpMF3_x(StpMF1 x) { return StpMF3(x, x, x); }
    #define StpMF3_(x) StpMF3_x(StpMF1(x))
    StpMF4 StpMF4_x(StpMF1 x) { return StpMF4(x, x, x, x); }
    #define StpMF4_(x) StpMF4_x(StpMF1(x))
//------------------------------------------------------------------------------------------------------------------------------
    // Medium precision unsigned integer.
    StpMU1 StpMU1_x(StpMU1 x) { return StpMU1(x); }
    #define StpMU1_(x) StpMU1_x(StpMU1(x))
    StpMU2 StpMU2_x(StpMU1 x) { return StpMU2(x, x); }
    #define StpMU2_(x) StpMU2_x(StpMU1(x))
    StpMU3 StpMU3_x(StpMU1 x) { return StpMU3(x, x, x); }
    #define StpMU3_(x) StpMU3_x(StpMU1(x))
    StpMU4 StpMU4_x(StpMU1 x) { return StpMU4(x, x, x, x); }
    #define StpMU4_(x) StpMU4_x(StpMU1(x))
//------------------------------------------------------------------------------------------------------------------------------
    // 32-bit unsigned integer.
    StpU1 StpU1_x(StpU1 x) { return StpU1(x); }
    #define StpU1_(x) StpU1_x(StpU1(x))
    StpU2 StpU2_x(StpU1 x) { return StpU2(x, x); }
    #define StpU2_(x) StpU2_x(StpU1(x))
    StpU3 StpU3_x(StpU1 x) { return StpU3(x, x, x); }
    #define StpU3_(x) StpU3_x(StpU1(x))
    StpU4 StpU4_x(StpU1 x) { return StpU4(x, x, x, x); }
    #define StpU4_(x) StpU4_x(StpU1(x))
//------------------------------------------------------------------------------------------------------------------------------
    // StpCpySgnF*(d, s): the enabled path returns 'd' with its sign bit replaced by the sign bit of 's'.
    #if 0
        // Slow implementation (if not pattern matched by a compiler).
        // NOTE(review): this path ORs the sign of 's' into 'd' and never clears the sign bit of 'd', so it only agrees
        // with the BFI path below when 'd' is non-negative - confirm before enabling.
        StpF1 StpCpySgnF1(StpF1 d, StpF1 s) { return StpF1_U1(StpU1_F1(d) | (StpU1_F1(s) & StpU1_(0x80000000u))); }
        StpF2 StpCpySgnF2(StpF2 d, StpF2 s) { return StpF2_U2(StpU2_F2(d) | (StpU2_F2(s) & StpU2_(0x80000000u))); }
        StpF3 StpCpySgnF3(StpF3 d, StpF3 s) { return StpF3_U3(StpU3_F3(d) | (StpU3_F3(s) & StpU3_(0x80000000u))); }
        StpF4 StpCpySgnF4(StpF4 d, StpF4 s) { return StpF4_U4(StpU4_F4(d) | (StpU4_F4(s) & StpU4_(0x80000000u))); }
    #else
        // Faster implementation (one portable BFI).
        // The low 31 bits come from 'd', the remaining top (sign) bit is kept from 's'.
        StpF1 StpCpySgnF1(StpF1 d, StpF1 s) {
            StpU1 signSource = StpU1_F1(s);
            StpU1 magnitudeSource = StpU1_F1(d);
            return StpF1_U1(StpBfiMskU1(signSource, magnitudeSource, StpU1_(31)));
        }
        // Vector forms apply the scalar version per component.
        StpF2 StpCpySgnF2(StpF2 d, StpF2 s) {
            StpF1 rx = StpCpySgnF1(d.x, s.x);
            StpF1 ry = StpCpySgnF1(d.y, s.y);
            return StpF2(rx, ry);
        }
        StpF3 StpCpySgnF3(StpF3 d, StpF3 s) {
            StpF1 rx = StpCpySgnF1(d.x, s.x);
            StpF1 ry = StpCpySgnF1(d.y, s.y);
            StpF1 rz = StpCpySgnF1(d.z, s.z);
            return StpF3(rx, ry, rz);
        }
        StpF4 StpCpySgnF4(StpF4 d, StpF4 s) {
            StpF1 rx = StpCpySgnF1(d.x, s.x);
            StpF1 ry = StpCpySgnF1(d.y, s.y);
            StpF1 rz = StpCpySgnF1(d.z, s.z);
            StpF1 rw = StpCpySgnF1(d.w, s.w);
            return StpF4(rx, ry, rz, rw);
        }
    #endif
    // Three-input max/min, evaluated as op(x, op(y, z)).
    StpF1 StpMax3F1(StpF1 x, StpF1 y, StpF1 z) { StpF1 yz = max(y, z); return max(x, yz); }
    StpF2 StpMax3F2(StpF2 x, StpF2 y, StpF2 z) { StpF2 yz = max(y, z); return max(x, yz); }
    StpF3 StpMax3F3(StpF3 x, StpF3 y, StpF3 z) { StpF3 yz = max(y, z); return max(x, yz); }
    StpF4 StpMax3F4(StpF4 x, StpF4 y, StpF4 z) { StpF4 yz = max(y, z); return max(x, yz); }
    StpF1 StpMin3F1(StpF1 x, StpF1 y, StpF1 z) { StpF1 yz = min(y, z); return min(x, yz); }
    StpF2 StpMin3F2(StpF2 x, StpF2 y, StpF2 z) { StpF2 yz = min(y, z); return min(x, yz); }
    StpF3 StpMin3F3(StpF3 x, StpF3 y, StpF3 z) { StpF3 yz = min(y, z); return min(x, yz); }
    StpF4 StpMin3F4(StpF4 x, StpF4 y, StpF4 z) { StpF4 yz = min(y, z); return min(x, yz); }
    StpU1 StpMax3U1(StpU1 x, StpU1 y, StpU1 z) { StpU1 yz = max(y, z); return max(x, yz); }
    StpU1 StpMin3U1(StpU1 x, StpU1 y, StpU1 z) { StpU1 yz = min(y, z); return min(x, yz); }
467 StpU4 StpMin3U4(StpU4 x, StpU4 y, StpU4 z) { return min(x, min(y, z)); } 468//------------------------------------------------------------------------------------------------------------------------------ 469 StpMF1 StpMax3MF1(StpMF1 x, StpMF1 y, StpMF1 z) { return max(x, max(y, z)); } 470 StpMF2 StpMax3MF2(StpMF2 x, StpMF2 y, StpMF2 z) { return max(x, max(y, z)); } 471 StpMF3 StpMax3MF3(StpMF3 x, StpMF3 y, StpMF3 z) { return max(x, max(y, z)); } 472 StpMF4 StpMax3MF4(StpMF4 x, StpMF4 y, StpMF4 z) { return max(x, max(y, z)); } 473 StpMF1 StpMin3MF1(StpMF1 x, StpMF1 y, StpMF1 z) { return min(x, min(y, z)); } 474 StpMF2 StpMin3MF2(StpMF2 x, StpMF2 y, StpMF2 z) { return min(x, min(y, z)); } 475 StpMF3 StpMin3MF3(StpMF3 x, StpMF3 y, StpMF3 z) { return min(x, min(y, z)); } 476 StpMF4 StpMin3MF4(StpMF4 x, StpMF4 y, StpMF4 z) { return min(x, min(y, z)); } 477//------------------------------------------------------------------------------------------------------------------------------ 478 // Make {<+0 := -1.0, >=+0 := 1.0}. 
// Sign-to-one helper (its mapping is documented on the line just above), then 16-bit splats and the GLSL-only wrappers.
    // 0x3f800000 supplies the low 31 bits of the result and 'x' supplies the remaining (sign) bit.
    StpF1 StpSgnOneF1(StpF1 x) { return StpF1_U1(StpBfiMskU1(StpU1_F1(x), StpU1_(0x3f800000), StpU1_(31))); }
#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
//==============================================================================================================================
#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
    // Splat constructors: replicate one scalar into every component of the result.
    StpH1 StpH1_x(StpH1 x) { return StpH1(x); }
    StpH2 StpH2_x(StpH1 x) { return StpH2(x, x); }
    StpH3 StpH3_x(StpH1 x) { return StpH3(x, x, x); }
    StpH4 StpH4_x(StpH1 x) { return StpH4(x, x, x, x); }
    // Convenience forms which first convert the argument to the scalar type.
    #define StpH1_(v) StpH1_x(StpH1(v))
    #define StpH2_(v) StpH2_x(StpH1(v))
    #define StpH3_(v) StpH3_x(StpH1(v))
    #define StpH4_(v) StpH4_x(StpH1(v))
//------------------------------------------------------------------------------------------------------------------------------
    StpW1 StpW1_x(StpW1 x) { return StpW1(x); }
    StpW2 StpW2_x(StpW1 x) { return StpW2(x, x); }
    StpW3 StpW3_x(StpW1 x) { return StpW3(x, x, x); }
    StpW4 StpW4_x(StpW1 x) { return StpW4(x, x, x, x); }
    #define StpW1_(v) StpW1_x(StpW1(v))
    #define StpW2_(v) StpW2_x(StpW1(v))
    #define StpW3_(v) StpW3_x(StpW1(v))
    #define StpW4_(v) StpW4_x(StpW1(v))
//------------------------------------------------------------------------------------------------------------------------------
    // Three-input maximum and minimum for the 16-bit types.
    StpH1 StpMax3H1(StpH1 x, StpH1 y, StpH1 z) { return max(x, max(y, z)); }
    StpH2 StpMax3H2(StpH2 x, StpH2 y, StpH2 z) { return max(x, max(y, z)); }
    StpH3 StpMax3H3(StpH3 x, StpH3 y, StpH3 z) { return max(x, max(y, z)); }
    StpH4 StpMax3H4(StpH4 x, StpH4 y, StpH4 z) { return max(x, max(y, z)); }
    StpH1 StpMin3H1(StpH1 x, StpH1 y, StpH1 z) { return min(x, min(y, z)); }
    StpH2 StpMin3H2(StpH2 x, StpH2 y, StpH2 z) { return min(x, min(y, z)); }
    StpH3 StpMin3H3(StpH3 x, StpH3 y, StpH3 z) { return min(x, min(y, z)); }
    StpH4 StpMin3H4(StpH4 x, StpH4 y, StpH4 z) { return min(x, min(y, z)); }
    StpW1 StpMax3W1(StpW1 x, StpW1 y, StpW1 z) { return max(x, max(y, z)); }
    StpW1 StpMin3W1(StpW1 x, StpW1 y, StpW1 z) { return min(x, min(y, z)); }
#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_GLSL)
    // GLSL spellings of the portable {fract, lerp, rcp, rsq, sat} helpers.
    StpF1 StpFractF1(StpF1 x) { return fract(x); }
    StpF2 StpFractF2(StpF2 x) { return fract(x); }
    StpF3 StpFractF3(StpF3 x) { return fract(x); }
    StpF4 StpFractF4(StpF4 x) { return fract(x); }
    StpF1 StpLerpF1(StpF1 x, StpF1 y, StpF1 z) { return mix(x, y, z); }
    StpF2 StpLerpF2(StpF2 x, StpF2 y, StpF2 z) { return mix(x, y, z); }
    StpF3 StpLerpF3(StpF3 x, StpF3 y, StpF3 z) { return mix(x, y, z); }
    StpF4 StpLerpF4(StpF4 x, StpF4 y, StpF4 z) { return mix(x, y, z); }
    // Reciprocal is written as a divide in this branch.
    StpF1 StpRcpF1(StpF1 x) { return StpF1_(1.0) / x; }
    StpF2 StpRcpF2(StpF2 x) { return StpF2_(1.0) / x; }
    StpF3 StpRcpF3(StpF3 x) { return StpF3_(1.0) / x; }
    StpF4 StpRcpF4(StpF4 x) { return StpF4_(1.0) / x; }
    StpF1 StpRsqF1(StpF1 x) { return inversesqrt(x); }
    StpF2 StpRsqF2(StpF2 x) { return inversesqrt(x); }
    StpF3 StpRsqF3(StpF3 x) { return inversesqrt(x); }
    StpF4 StpRsqF4(StpF4 x) { return inversesqrt(x); }
    // Saturate is a clamp to {0 to 1}.
    StpF1 StpSatF1(StpF1 x) { return clamp(x, StpF1_(0.0), StpF1_(1.0)); }
    StpF2 StpSatF2(StpF2 x) { return clamp(x, StpF2_(0.0), StpF2_(1.0)); }
    StpF3 StpSatF3(StpF3 x) { return clamp(x, StpF3_(0.0), StpF3_(1.0)); }
    StpF4 StpSatF4(StpF4 x) { return clamp(x, StpF4_(0.0), StpF4_(1.0)); }
//------------------------------------------------------------------------------------------------------------------------------
    StpMF1 StpFractMF1(StpMF1 x) { return fract(x); }
    StpMF2 StpFractMF2(StpMF2 x) { return fract(x); }
    StpMF3 StpFractMF3(StpMF3 x) { return fract(x); }
    StpMF4 StpFractMF4(StpMF4 x) { return fract(x); }
    StpMF1 StpLerpMF1(StpMF1 x, StpMF1 y, StpMF1 z) { return mix(x, y, z); }
    StpMF2 StpLerpMF2(StpMF2 x, StpMF2 y, StpMF2 z) { return mix(x, y, z); }
    StpMF3 StpLerpMF3(StpMF3 x, StpMF3 y, StpMF3 z) { return mix(x, y, z); }
    StpMF4 StpLerpMF4(StpMF4 x, StpMF4 y, StpMF4 z) { return mix(x, y, z); }
    StpMF1 StpRcpMF1(StpMF1 x) { return StpMF1_(1.0) / x; }
    StpMF2 StpRcpMF2(StpMF2 x) { return StpMF2_(1.0) / x; }
    StpMF3 StpRcpMF3(StpMF3 x) { return StpMF3_(1.0) / x; }
    StpMF4 StpRcpMF4(StpMF4 x) { return StpMF4_(1.0) / x; }
    StpMF1 StpRsqMF1(StpMF1 x) { return inversesqrt(x); }
    StpMF2 StpRsqMF2(StpMF2 x) { return inversesqrt(x); }
    StpMF3 StpRsqMF3(StpMF3 x) { return inversesqrt(x); }
    StpMF4 StpRsqMF4(StpMF4 x) { return inversesqrt(x); }
    StpMF1 StpSatMF1(StpMF1 x) { return clamp(x, StpMF1_(0.0), StpMF1_(1.0)); }
    StpMF2 StpSatMF2(StpMF2 x) { return clamp(x, StpMF2_(0.0), StpMF2_(1.0)); }
    StpMF3 StpSatMF3(StpMF3 x) { return clamp(x, StpMF3_(0.0), StpMF3_(1.0)); }
    StpMF4 StpSatMF4(StpMF4 x) { return clamp(x, StpMF4_(0.0), StpMF4_(1.0)); }
#endif // defined(STP_GPU) && defined(STP_GLSL)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
    // GLSL spellings of the same helpers for the 16-bit types.
    StpH1 StpFractH1(StpH1 x) { return fract(x); }
    StpH2 StpFractH2(StpH2 x) { return fract(x); }
    StpH3 StpFractH3(StpH3 x) { return fract(x); }
    StpH4 StpFractH4(StpH4 x) { return fract(x); }
    StpH1 StpLerpH1(StpH1 x, StpH1 y, StpH1 z) { return mix(x, y, z); }
    StpH2 StpLerpH2(StpH2 x, StpH2 y, StpH2 z) { return mix(x, y, z); }
    StpH3 StpLerpH3(StpH3 x, StpH3 y, StpH3 z) { return mix(x, y, z); }
    StpH4 StpLerpH4(StpH4 x, StpH4 y, StpH4 z) { return mix(x, y, z); }
    StpH1 StpRcpH1(StpH1 x) { return StpH1_(1.0) / x; }
    StpH2 StpRcpH2(StpH2 x) { return StpH2_(1.0) / x; }
// Remaining GLSL 16-bit wrappers (reciprocal, reciprocal square root, saturate), then the HLSL spellings.
    StpH3 StpRcpH3(StpH3 x) { return StpH3_(1.0) / x; }
    StpH4 StpRcpH4(StpH4 x) { return StpH4_(1.0) / x; }
    StpH1 StpRsqH1(StpH1 x) { return inversesqrt(x); }
    StpH2 StpRsqH2(StpH2 x) { return inversesqrt(x); }
    StpH3 StpRsqH3(StpH3 x) { return inversesqrt(x); }
    StpH4 StpRsqH4(StpH4 x) { return inversesqrt(x); }
    StpH1 StpSatH1(StpH1 x) { return clamp(x, StpH1_(0.0), StpH1_(1.0)); }
    StpH2 StpSatH2(StpH2 x) { return clamp(x, StpH2_(0.0), StpH2_(1.0)); }
    StpH3 StpSatH3(StpH3 x) { return clamp(x, StpH3_(0.0), StpH3_(1.0)); }
    StpH4 StpSatH4(StpH4 x) { return clamp(x, StpH4_(0.0), StpH4_(1.0)); }
#endif // defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_HLSL)
    // HLSL spellings of the portable {fract, lerp, rcp, rsq, sat} helpers.
    // Fract is written as 'x - floor(x)' in this branch.
    StpF1 StpFractF1(StpF1 x) { return x - floor(x); }
    StpF2 StpFractF2(StpF2 x) { return x - floor(x); }
    StpF3 StpFractF3(StpF3 x) { return x - floor(x); }
    StpF4 StpFractF4(StpF4 x) { return x - floor(x); }
    StpF1 StpLerpF1(StpF1 x, StpF1 y, StpF1 z) { return lerp(x, y, z); }
    StpF2 StpLerpF2(StpF2 x, StpF2 y, StpF2 z) { return lerp(x, y, z); }
    StpF3 StpLerpF3(StpF3 x, StpF3 y, StpF3 z) { return lerp(x, y, z); }
    StpF4 StpLerpF4(StpF4 x, StpF4 y, StpF4 z) { return lerp(x, y, z); }
    StpF1 StpRcpF1(StpF1 x) { return rcp(x); }
    StpF2 StpRcpF2(StpF2 x) { return rcp(x); }
    StpF3 StpRcpF3(StpF3 x) { return rcp(x); }
    StpF4 StpRcpF4(StpF4 x) { return rcp(x); }
    StpF1 StpRsqF1(StpF1 x) { return rsqrt(x); }
    StpF2 StpRsqF2(StpF2 x) { return rsqrt(x); }
    StpF3 StpRsqF3(StpF3 x) { return rsqrt(x); }
    StpF4 StpRsqF4(StpF4 x) { return rsqrt(x); }
    StpF1 StpSatF1(StpF1 x) { return saturate(x); }
    StpF2 StpSatF2(StpF2 x) { return saturate(x); }
    StpF3 StpSatF3(StpF3 x) { return saturate(x); }
    StpF4 StpSatF4(StpF4 x) { return saturate(x); }
//------------------------------------------------------------------------------------------------------------------------------
    StpMF1 StpFractMF1(StpMF1 x) { return x - floor(x); }
    StpMF2 StpFractMF2(StpMF2 x) { return x - floor(x); }
    StpMF3 StpFractMF3(StpMF3 x) { return x - floor(x); }
    StpMF4 StpFractMF4(StpMF4 x) { return x - floor(x); }
    StpMF1 StpLerpMF1(StpMF1 x, StpMF1 y, StpMF1 z) { return lerp(x, y, z); }
    StpMF2 StpLerpMF2(StpMF2 x, StpMF2 y, StpMF2 z) { return lerp(x, y, z); }
    StpMF3 StpLerpMF3(StpMF3 x, StpMF3 y, StpMF3 z) { return lerp(x, y, z); }
    StpMF4 StpLerpMF4(StpMF4 x, StpMF4 y, StpMF4 z) { return lerp(x, y, z); }
    StpMF1 StpRcpMF1(StpMF1 x) { return rcp(x); }
    StpMF2 StpRcpMF2(StpMF2 x) { return rcp(x); }
    StpMF3 StpRcpMF3(StpMF3 x) { return rcp(x); }
    StpMF4 StpRcpMF4(StpMF4 x) { return rcp(x); }
    StpMF1 StpRsqMF1(StpMF1 x) { return rsqrt(x); }
    StpMF2 StpRsqMF2(StpMF2 x) { return rsqrt(x); }
    StpMF3 StpRsqMF3(StpMF3 x) { return rsqrt(x); }
    StpMF4 StpRsqMF4(StpMF4 x) { return rsqrt(x); }
    StpMF1 StpSatMF1(StpMF1 x) { return saturate(x); }
    StpMF2 StpSatMF2(StpMF2 x) { return saturate(x); }
    StpMF3 StpSatMF3(StpMF3 x) { return saturate(x); }
    StpMF4 StpSatMF4(StpMF4 x) { return saturate(x); }
#endif // defined(STP_GPU) && defined(STP_HLSL)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
    // HLSL spellings of the same helpers for the 16-bit types.
    StpH1 StpFractH1(StpH1 x) { return x - floor(x); }
    StpH2 StpFractH2(StpH2 x) { return x - floor(x); }
    StpH3 StpFractH3(StpH3 x) { return x - floor(x); }
    StpH4 StpFractH4(StpH4 x) { return x - floor(x); }
    StpH1 StpLerpH1(StpH1 x, StpH1 y, StpH1 z) { return lerp(x, y, z); }
    StpH2 StpLerpH2(StpH2 x, StpH2 y, StpH2 z) { return lerp(x, y, z); }
    StpH3 StpLerpH3(StpH3 x, StpH3 y, StpH3 z) { return lerp(x, y, z); }
// Remaining HLSL 16-bit wrappers (lerp, reciprocal, reciprocal square root, saturate).
    StpH4 StpLerpH4(StpH4 x, StpH4 y, StpH4 z) { return lerp(x, y, z); }
    StpH1 StpRcpH1(StpH1 x) { return rcp(x); }
    StpH2 StpRcpH2(StpH2 x) { return rcp(x); }
    StpH3 StpRcpH3(StpH3 x) { return rcp(x); }
    StpH4 StpRcpH4(StpH4 x) { return rcp(x); }
    StpH1 StpRsqH1(StpH1 x) { return rsqrt(x); }
    StpH2 StpRsqH2(StpH2 x) { return rsqrt(x); }
    StpH3 StpRsqH3(StpH3 x) { return rsqrt(x); }
    StpH4 StpRsqH4(StpH4 x) { return rsqrt(x); }
    StpH1 StpSatH1(StpH1 x) { return saturate(x); }
    StpH2 StpSatH2(StpH2 x) { return saturate(x); }
    StpH3 StpSatH3(StpH3 x) { return saturate(x); }
    StpH4 StpSatH4(StpH4 x) { return saturate(x); }
#endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
//==============================================================================================================================
#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
    StpF1 StpExp2F1(StpF1 x) { return exp2(x); }
    StpF1 StpLog2F1(StpF1 x) { return log2(x); }
//------------------------------------------------------------------------------------------------------------------------------
    StpMF1 StpExp2MF1(StpMF1 x) { return exp2(x); }
    StpMF1 StpLog2MF1(StpMF1 x) { return log2(x); }
//------------------------------------------------------------------------------------------------------------------------------
    // Negative and positive infinity, built from their 32-bit patterns.
    #define STP_INFN_F StpF1_U1(0xff800000u)
    #define STP_INFP_F StpF1_U1(0x7f800000u)
    // StpGtZero*: {x > 0 := 1.0, else 0.0}. StpSigned*: {x < 0 := 1.0, else 0.0}.
    #if STP_BUG_SAT_INF
        // Defined if unable to use the fast path because of problem related to saturating +/- INF.
        // This branch selects explicitly, and the vector forms go through the scalar version per component.
        StpF1 StpGtZeroF1(StpF1 x) { return (x > StpF1_(0.0)) ? StpF1_(1.0) : StpF1_(0.0); }
        StpF3 StpGtZeroF3(StpF3 x) { return StpF3(StpGtZeroF1(x.x), StpGtZeroF1(x.y), StpGtZeroF1(x.z)); }
        StpF4 StpGtZeroF4(StpF4 x) { return StpF4(StpGtZeroF1(x.x), StpGtZeroF1(x.y),
                                                  StpGtZeroF1(x.z), StpGtZeroF1(x.w)); }
        StpF1 StpSignedF1(StpF1 x) { return (x < StpF1_(0.0)) ? StpF1_(1.0) : StpF1_(0.0); }
        StpF2 StpSignedF2(StpF2 x) { return StpF2(StpSignedF1(x.x), StpSignedF1(x.y)); }
        StpF3 StpSignedF3(StpF3 x) { return StpF3(StpSignedF1(x.x), StpSignedF1(x.y), StpSignedF1(x.z)); }
        StpF4 StpSignedF4(StpF4 x) { return StpF4(StpSignedF1(x.x), StpSignedF1(x.y),
                                                  StpSignedF1(x.z), StpSignedF1(x.w)); }
    #else
        // Fast path: scale by +/- INF, then saturate.
        StpF1 StpGtZeroF1(StpF1 x) { return StpSatF1(x * StpF1_(STP_INFP_F)); }
        StpF3 StpGtZeroF3(StpF3 x) { return StpSatF3(x * StpF3_(STP_INFP_F)); }
        StpF4 StpGtZeroF4(StpF4 x) { return StpSatF4(x * StpF4_(STP_INFP_F)); }
        StpF1 StpSignedF1(StpF1 x) { return StpSatF1(x * StpF1_(STP_INFN_F)); }
        StpF2 StpSignedF2(StpF2 x) { return StpSatF2(x * StpF2_(STP_INFN_F)); }
        StpF3 StpSignedF3(StpF3 x) { return StpSatF3(x * StpF3_(STP_INFN_F)); }
        StpF4 StpSignedF4(StpF4 x) { return StpSatF4(x * StpF4_(STP_INFN_F)); }
    #endif // STP_BUG_SAT_INF
//------------------------------------------------------------------------------------------------------------------------------
    // Low precision square root: exact 'sqrt' when STP_BUG_PRX is set, otherwise an integer shift-and-add on the bits.
    #if STP_BUG_PRX
        StpF1 StpPrxLoSqrtF1(StpF1 a) { return sqrt(a); }
        StpF3 StpPrxLoSqrtF3(StpF3 a) { return sqrt(a); }
        StpF4 StpPrxLoSqrtF4(StpF4 a) { return sqrt(a); }
    #else
        StpF1 StpPrxLoSqrtF1(StpF1 a) { return StpF1_U1((StpU1_F1(a) >> StpU1_(1)) + StpU1_(0x1fbc4639)); }
        StpF3 StpPrxLoSqrtF3(StpF3 a) { return StpF3_U3((StpU3_F3(a) >> StpU3_(1)) + StpU3_(0x1fbc4639)); }
        StpF4 StpPrxLoSqrtF4(StpF4 a) { return StpF4_U4((StpU4_F4(a) >> StpU4_(1)) + StpU4_(0x1fbc4639)); }
    #endif // STP_BUG_PRX
//------------------------------------------------------------------------------------------------------------------------------
    // Low and medium precision reciprocal: the exact StpRcp* when STP_BUG_PRX is set, otherwise integer approximations.
    #if STP_BUG_PRX
        StpF1 StpPrxLoRcpF1(StpF1 a) { return StpRcpF1(a); }
        StpF2 StpPrxLoRcpF2(StpF2 a) { return StpRcpF2(a); }
        StpF3 StpPrxLoRcpF3(StpF3 a) { return StpRcpF3(a); }
        StpF4 StpPrxLoRcpF4(StpF4 a) { return StpRcpF4(a); }
        StpF1 StpPrxMedRcpF1(StpF1 a) { return StpRcpF1(a); }
        StpF3 StpPrxMedRcpF3(StpF3 a) { return StpRcpF3(a); }
    #else
        StpF1 StpPrxLoRcpF1(StpF1 a) { return StpF1_U1(StpU1_(0x7ef07ebb) - StpU1_F1(a)); }
        StpF2 StpPrxLoRcpF2(StpF2 a) { return StpF2_U2(StpU2_(0x7ef07ebb) - StpU2_F2(a)); }
        StpF3 StpPrxLoRcpF3(StpF3 a) { return StpF3_U3(StpU3_(0x7ef07ebb) - StpU3_F3(a)); }
        StpF4 StpPrxLoRcpF4(StpF4 a) { return StpF4_U4(StpU4_(0x7ef07ebb) - StpU4_F4(a)); }
        // Medium precision: integer estimate 'b', then one refinement step 'b * (2 - b * a)'.
        StpF1 StpPrxMedRcpF1(StpF1 a) { StpF1 b = StpF1_U1(StpU1_(0x7ef19fff) - StpU1_F1(a));
                                        return b * (-b * a + StpF1_(2.0)); }
        StpF3 StpPrxMedRcpF3(StpF3 a) { StpF3 b = StpF3_U3(StpU3_(0x7ef19fff) - StpU3_F3(a));
                                        return b * (-b * a + StpF3_(2.0)); }
    #endif // STP_BUG_PRX
//------------------------------------------------------------------------------------------------------------------------------
    // Storage and parameter qualifiers as spelled for the GPU languages.
    #define STP_STATIC /* */
    #define StpInF2 in StpF2
    #define StpInF4 in StpF4
    #define StpInOutU4 inout StpU4
    #define StpOutF2 out StpF2
    #define StpVarF2 StpF2
#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
//==============================================================================================================================
#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_MEDIUM)
    #if STP_BUG_SAT_INF
        // Defined if unable to use the fast path because of problem related to saturating +/- INF.
        StpMF1 StpGtZeroMF1(StpMF1 x) { return (x > StpMF1_(0.0)) ? StpMF1_(1.0) : StpMF1_(0.0); }
        StpMF3 StpGtZeroMF3(StpMF3 x) { return StpMF3(StpGtZeroMF1(x.x), StpGtZeroMF1(x.y), StpGtZeroMF1(x.z)); }
        StpMF4 StpGtZeroMF4(StpMF4 x) { return StpMF4(StpGtZeroMF1(x.x), StpGtZeroMF1(x.y),
                                                      StpGtZeroMF1(x.z), StpGtZeroMF1(x.w)); }
        StpMF1 StpSignedMF1(StpMF1 x) { return (x < StpMF1_(0.0)) ? StpMF1_(1.0) : StpMF1_(0.0); }
        StpMF2 StpSignedMF2(StpMF2 x) { return StpMF2(StpSignedMF1(x.x), StpSignedMF1(x.y)); }
        StpMF3 StpSignedMF3(StpMF3 x) { return StpMF3(StpSignedMF1(x.x), StpSignedMF1(x.y), StpSignedMF1(x.z)); }
        StpMF4 StpSignedMF4(StpMF4 x) { return StpMF4(StpSignedMF1(x.x), StpSignedMF1(x.y),
                                                      StpSignedMF1(x.z), StpSignedMF1(x.w)); }
    #elif STP_BUG_SAT
        // Defined if compiler factors out saturation incorrectly.
        // Saturation is spelled as an explicit max(min()) pair in this branch.
        #define STP_INFN_MF StpMF1(StpF1_U1(0xff800000u))
        #define STP_INFP_MF StpMF1(StpF1_U1(0x7f800000u))
        StpMF1 StpGtZeroMF1(StpMF1 x) { return max(min(x * StpMF1_(STP_INFP_MF), StpMF1_(1.0)), StpMF1_(0.0)); }
        StpMF3 StpGtZeroMF3(StpMF3 x) { return max(min(x * StpMF3_(STP_INFP_MF), StpMF3_(1.0)), StpMF3_(0.0)); }
        StpMF4 StpGtZeroMF4(StpMF4 x) { return max(min(x * StpMF4_(STP_INFP_MF), StpMF4_(1.0)), StpMF4_(0.0)); }
        StpMF1 StpSignedMF1(StpMF1 x) { return max(min(x * StpMF1_(STP_INFN_MF), StpMF1_(1.0)), StpMF1_(0.0)); }
        StpMF2 StpSignedMF2(StpMF2 x) { return max(min(x * StpMF2_(STP_INFN_MF), StpMF2_(1.0)), StpMF2_(0.0)); }
        StpMF3 StpSignedMF3(StpMF3 x) { return max(min(x * StpMF3_(STP_INFN_MF), StpMF3_(1.0)), StpMF3_(0.0)); }
        StpMF4 StpSignedMF4(StpMF4 x) { return max(min(x * StpMF4_(STP_INFN_MF), StpMF4_(1.0)), StpMF4_(0.0)); }
    #else
        // Using +/- INF typecast down to medium precision.
// Medium precision +/- INF constants for the default path (neither STP_BUG_SAT_INF nor STP_BUG_SAT set).
        #define STP_INFN_MF StpMF1(StpF1_U1(0xff800000u))
        #define STP_INFP_MF StpMF1(StpF1_U1(0x7f800000u))
        // Scale by +/- INF, then saturate.
        StpMF1 StpGtZeroMF1(StpMF1 x) { return StpSatMF1(x * StpMF1_(STP_INFP_MF)); }
        StpMF3 StpGtZeroMF3(StpMF3 x) { return StpSatMF3(x * StpMF3_(STP_INFP_MF)); }
        StpMF4 StpGtZeroMF4(StpMF4 x) { return StpSatMF4(x * StpMF4_(STP_INFP_MF)); }
        StpMF1 StpSignedMF1(StpMF1 x) { return StpSatMF1(x * StpMF1_(STP_INFN_MF)); }
        StpMF2 StpSignedMF2(StpMF2 x) { return StpSatMF2(x * StpMF2_(STP_INFN_MF)); }
        StpMF3 StpSignedMF3(StpMF3 x) { return StpSatMF3(x * StpMF3_(STP_INFN_MF)); }
        StpMF4 StpSignedMF4(StpMF4 x) { return StpSatMF4(x * StpMF4_(STP_INFN_MF)); }
    #endif // STP_BUG_SAT_INF
//------------------------------------------------------------------------------------------------------------------------------
    // Unable to use the approximations due to not knowing what the type actually is.
    StpMF1 StpPrxLoSqrtMF1(StpMF1 a) { return sqrt(a); }
    StpMF3 StpPrxLoSqrtMF3(StpMF3 a) { return sqrt(a); }
    StpMF4 StpPrxLoSqrtMF4(StpMF4 a) { return sqrt(a); }
//------------------------------------------------------------------------------------------------------------------------------
    // For the same reason the reciprocal "approximations" are the exact StpRcpMF* here.
    StpMF1 StpPrxLoRcpMF1(StpMF1 a) { return StpRcpMF1(a); }
    StpMF2 StpPrxLoRcpMF2(StpMF2 a) { return StpRcpMF2(a); }
    StpMF3 StpPrxLoRcpMF3(StpMF3 a) { return StpRcpMF3(a); }
    StpMF4 StpPrxLoRcpMF4(StpMF4 a) { return StpRcpMF4(a); }
    StpMF1 StpPrxMedRcpMF1(StpMF1 a) { return StpRcpMF1(a); }
    StpMF3 StpPrxMedRcpMF3(StpMF3 a) { return StpRcpMF3(a); }
#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_MEDIUM)
//==============================================================================================================================
#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && (!defined(STP_MEDIUM))
    // Same types so just use the full precision version.
    // NOTE(review): StpGtZeroMF2 forwards to StpGtZeroF2, which is not defined in the nearby full precision section --
    // confirm it exists elsewhere before using the MF2 alias.
    #define StpGtZeroMF1(v) StpGtZeroF1(v)
    #define StpGtZeroMF2(v) StpGtZeroF2(v)
    #define StpGtZeroMF3(v) StpGtZeroF3(v)
    #define StpGtZeroMF4(v) StpGtZeroF4(v)
    #define StpSignedMF1(v) StpSignedF1(v)
    #define StpSignedMF2(v) StpSignedF2(v)
    #define StpSignedMF3(v) StpSignedF3(v)
    #define StpSignedMF4(v) StpSignedF4(v)
//------------------------------------------------------------------------------------------------------------------------------
    // The medium precision types are the same as the full precision so use the full precision approximations.
    #define StpPrxLoSqrtMF1(v) StpPrxLoSqrtF1(v)
    #define StpPrxLoSqrtMF3(v) StpPrxLoSqrtF3(v)
    #define StpPrxLoSqrtMF4(v) StpPrxLoSqrtF4(v)
//------------------------------------------------------------------------------------------------------------------------------
    #define StpPrxLoRcpMF1(v) StpPrxLoRcpF1(v)
    #define StpPrxLoRcpMF2(v) StpPrxLoRcpF2(v)
    #define StpPrxLoRcpMF3(v) StpPrxLoRcpF3(v)
    #define StpPrxLoRcpMF4(v) StpPrxLoRcpF4(v)
    #define StpPrxMedRcpMF1(v) StpPrxMedRcpF1(v)
    #define StpPrxMedRcpMF3(v) StpPrxMedRcpF3(v)
#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && (!defined(STP_MEDIUM))
//==============================================================================================================================
#if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
    StpH1 StpExp2H1(StpH1 x) { return exp2(x); }
    StpH1 StpLog2H1(StpH1 x) { return log2(x); }
//------------------------------------------------------------------------------------------------------------------------------
    #if STP_BUG_ALIAS16
        // Use 32-bit aliasing to build the +/-INF, then typecast to 16-bit.
// 16-bit +/- INF constants; the first pair belongs to the STP_BUG_ALIAS16 branch opened just above.
        #define STP_INFN_H StpH1(StpF1_U1(0xff800000u))
        #define STP_INFP_H StpH1(StpF1_U1(0x7f800000u))
    #else
        // Build the constants directly from their 16-bit patterns.
        #define STP_INFN_H StpH1_W1(StpW1_(0xfc00))
        #define STP_INFP_H StpH1_W1(StpW1_(0x7c00))
    #endif // STP_BUG_ALIAS16
    // StpGtZero*: {x > 0 := 1.0, else 0.0}. StpSigned*: {x < 0 := 1.0, else 0.0}.
    #if STP_BUG_SAT_INF
        // Explicit selects, with the vector forms going through the scalar version per component.
        StpH1 StpGtZeroH1(StpH1 x) { return (x > StpH1_(0.0)) ? StpH1_(1.0) : StpH1_(0.0); }
        StpH2 StpGtZeroH2(StpH2 x) { return StpH2(StpGtZeroH1(x.x), StpGtZeroH1(x.y)); }
        StpH3 StpGtZeroH3(StpH3 x) { return StpH3(StpGtZeroH1(x.x), StpGtZeroH1(x.y), StpGtZeroH1(x.z)); }
        StpH4 StpGtZeroH4(StpH4 x) { return StpH4(StpGtZeroH1(x.x), StpGtZeroH1(x.y),
                                                  StpGtZeroH1(x.z), StpGtZeroH1(x.w)); }
        StpH1 StpSignedH1(StpH1 x) { return (x < StpH1_(0.0)) ? StpH1_(1.0) : StpH1_(0.0); }
        StpH2 StpSignedH2(StpH2 x) { return StpH2(StpSignedH1(x.x), StpSignedH1(x.y)); }
        StpH3 StpSignedH3(StpH3 x) { return StpH3(StpSignedH1(x.x), StpSignedH1(x.y), StpSignedH1(x.z)); }
        StpH4 StpSignedH4(StpH4 x) { return StpH4(StpSignedH1(x.x), StpSignedH1(x.y),
                                                  StpSignedH1(x.z), StpSignedH1(x.w)); }
    #elif STP_BUG_SAT
        // Saturation spelled as an explicit max(min()) pair.
        StpH1 StpGtZeroH1(StpH1 x) { return max(min(x * StpH1_(STP_INFP_H), StpH1_(1.0)), StpH1_(0.0)); }
        StpH2 StpGtZeroH2(StpH2 x) { return max(min(x * StpH2_(STP_INFP_H), StpH2_(1.0)), StpH2_(0.0)); }
        StpH3 StpGtZeroH3(StpH3 x) { return max(min(x * StpH3_(STP_INFP_H), StpH3_(1.0)), StpH3_(0.0)); }
        StpH4 StpGtZeroH4(StpH4 x) { return max(min(x * StpH4_(STP_INFP_H), StpH4_(1.0)), StpH4_(0.0)); }
        StpH1 StpSignedH1(StpH1 x) { return max(min(x * StpH1_(STP_INFN_H), StpH1_(1.0)), StpH1_(0.0)); }
        StpH2 StpSignedH2(StpH2 x) { return max(min(x * StpH2_(STP_INFN_H), StpH2_(1.0)), StpH2_(0.0)); }
        StpH3 StpSignedH3(StpH3 x) { return max(min(x * StpH3_(STP_INFN_H), StpH3_(1.0)), StpH3_(0.0)); }
        StpH4 StpSignedH4(StpH4 x) { return max(min(x * StpH4_(STP_INFN_H), StpH4_(1.0)), StpH4_(0.0)); }
    #else
        // Fast path: scale by +/- INF, then saturate.
        StpH1 StpGtZeroH1(StpH1 x) { return StpSatH1(x * StpH1_(STP_INFP_H)); }
        StpH2 StpGtZeroH2(StpH2 x) { return StpSatH2(x * StpH2_(STP_INFP_H)); }
        StpH3 StpGtZeroH3(StpH3 x) { return StpSatH3(x * StpH3_(STP_INFP_H)); }
        StpH4 StpGtZeroH4(StpH4 x) { return StpSatH4(x * StpH4_(STP_INFP_H)); }
        StpH1 StpSignedH1(StpH1 x) { return StpSatH1(x * StpH1_(STP_INFN_H)); }
        StpH2 StpSignedH2(StpH2 x) { return StpSatH2(x * StpH2_(STP_INFN_H)); }
        StpH3 StpSignedH3(StpH3 x) { return StpSatH3(x * StpH3_(STP_INFN_H)); }
        StpH4 StpSignedH4(StpH4 x) { return StpSatH4(x * StpH4_(STP_INFN_H)); }
    #endif // STP_BUG_SAT_INF
//------------------------------------------------------------------------------------------------------------------------------
    // Low precision square root: exact 'sqrt' when STP_BUG_PRX is set, otherwise an integer shift-and-add on the bits.
    #if STP_BUG_PRX
        StpH1 StpPrxLoSqrtH1(StpH1 a) { return sqrt(a); }
        StpH3 StpPrxLoSqrtH3(StpH3 a) { return sqrt(a); }
        StpH4 StpPrxLoSqrtH4(StpH4 a) { return sqrt(a); }
    #else
        StpH1 StpPrxLoSqrtH1(StpH1 a) { return StpH1_W1((StpW1_H1(a) >> StpW1_(1)) + StpW1_(0x1de2)); }
        StpH3 StpPrxLoSqrtH3(StpH3 a) { return StpH3_W3((StpW3_H3(a) >> StpW3_(1)) + StpW3_(0x1de2)); }
        StpH4 StpPrxLoSqrtH4(StpH4 a) { return StpH4_W4((StpW4_H4(a) >> StpW4_(1)) + StpW4_(0x1de2)); }
    #endif // STP_BUG_PRX
//------------------------------------------------------------------------------------------------------------------------------
    // Low and medium precision reciprocal: the exact StpRcpH* when STP_BUG_PRX is set, otherwise integer approximations.
    #if STP_BUG_PRX
        StpH1 StpPrxLoRcpH1(StpH1 a) { return StpRcpH1(a); }
        StpH2 StpPrxLoRcpH2(StpH2 a) { return StpRcpH2(a); }
        StpH3 StpPrxLoRcpH3(StpH3 a) { return StpRcpH3(a); }
        StpH4 StpPrxLoRcpH4(StpH4 a) { return StpRcpH4(a); }
        StpH1 StpPrxMedRcpH1(StpH1 a) { return StpRcpH1(a); }
        StpH3 StpPrxMedRcpH3(StpH3 a) { return StpRcpH3(a); }
    #else
        // Note this will create denormals.
        // MAPPING
        // -------
        //  +INF (7c00) -> -61568
        // 65504 (7bff) -> -61600
        // 30800 (7785) -> NaN
        // 30784 (7784) -> 0 ........ (any input larger than 30784 will break)
        //     1 (3c00) -> 0.9395 ... (so not energy preserving for 1.0)
        //     0 (0000) -> 30784
        StpH1 StpPrxLoRcpH1(StpH1 a) { return StpH1_W1(StpW1_(0x7784) - StpW1_H1(a)); }
        StpH2 StpPrxLoRcpH2(StpH2 a) { return StpH2_W2(StpW2_(0x7784) - StpW2_H2(a)); }
        StpH3 StpPrxLoRcpH3(StpH3 a) { return StpH3_W3(StpW3_(0x7784) - StpW3_H3(a)); }
        StpH4 StpPrxLoRcpH4(StpH4 a) { return StpH4_W4(StpW4_(0x7784) - StpW4_H4(a)); }
        // Anything larger than 30928 will break in this function.
        // Integer estimate 'b', then one refinement step 'b * (2 - b * a)'.
        StpH1 StpPrxMedRcpH1(StpH1 a) { StpH1 b = StpH1_W1(StpW1_(0x778d) - StpW1_H1(a));
                                        return b * (-b * a + StpH1_(2.0)); }
        StpH3 StpPrxMedRcpH3(StpH3 a) { StpH3 b = StpH3_W3(StpW3_(0x778d) - StpW3_H3(a));
                                        return b * (-b * a + StpH3_(2.0)); }
    #endif // STP_BUG_PRX
#endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                      LANE REMAPPING
//==============================================================================================================================
#if defined(STP_GPU)
    // More complex remap which is safe for both portability (different wave sizes up to 128) and for 2D wave reductions.
    // Which lane index bits feed each output coordinate:
    //  6543210
    //  =======
    //  ..xx..x
    //  yy..yy.
// Lane remap layout table (continues the bit diagram above), the remap function, and the first quality preset.
    // Details,
    //  LANE TO 8x16 MAPPING
    //  ====================
    //  00 01 08 09 10 11 18 19
    //  02 03 0a 0b 12 13 1a 1b
    //  04 05 0c 0d 14 15 1c 1d
    //  06 07 0e 0f 16 17 1e 1f
    //  20 21 28 29 30 31 38 39
    //  22 23 2a 2b 32 33 3a 3b
    //  24 25 2c 2d 34 35 3c 3d
    //  26 27 2e 2f 36 37 3e 3f
    //  .......................
    //  ... repeat the 8x8 ....
    //  .... pattern, but .....
    //  .... for 40 to 7f .....
    //  .......................
    // Returns the {x, y} position for lane index 'a' according to the table above.
    StpU2 StpRmp8x16U2(StpU1 a) {
        // Note the BFIs used for MSBs have "strange offsets" due to leaving space for the LSB bits replaced in the BFI.
        StpU1 xUpper = StpBfeU1(a, 2u, 3u);
        StpU1 yUpper = StpBfeU1(a, 3u, 4u);
        StpU1 yLower = StpBfeU1(a, 1u, 2u);
        return StpU2(StpBfiMskU1(xUpper, a, 1u), StpBfiMskU1(yUpper, yLower, 2u)); }
#endif // defined(STP_GPU)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                   PRESETS (DON'T CHANGE)
//==============================================================================================================================
// High-end mobile.
#if (STP_TAA_Q == 0)
    #define STP_GEAA_P 1
    #define STP_GEAA_SUBPIX (2.0 / 16.0)
    #define STP_TAA_PEN_F1 (1.0 / 4.0)
    #define STP_TAA_PEN_F0 (1.0 / 2.0)
    #define STP_TAA_PEN_W (1.0 / 2.0)
    #define STP_TAA_PRX_LANCZOS 1
    #define STP_TAA_PRX_LANCZOS_DERING 0
#endif // (STP_TAA_Q == 0)
//------------------------------------------------------------------------------------------------------------------------------
// Desktop.
#if (STP_TAA_Q == 1)
    #define STP_GEAA_P 3
    #define STP_GEAA_SUBPIX (2.0 / 16.0)
    #define STP_TAA_PEN_F1 (1.0 / 4.0)
    #define STP_TAA_PEN_F0 (1.0 / 2.0)
    #define STP_TAA_PEN_W (1.0 / 2.0)
    #define STP_TAA_PRX_LANCZOS 2
    #define STP_TAA_PRX_LANCZOS_DERING 1
#endif // (STP_TAA_Q == 1)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                               INTERNAL TUNING (DON'T CHANGE)
//==============================================================================================================================
// Limits on anti-flicker weighting, tuning for range and precision challenges of FP16.
#define STP_ANTI_MAX 8192.0
// Using '1/8192' causes known problems on some platforms that are 16-bit precision challenged.
#define STP_ANTI_MIN (1.0 / 4096.0)
//------------------------------------------------------------------------------------------------------------------------------
#define STP_DITHER_DEPTH 1
#define STP_DITHER_MOTION 1
//------------------------------------------------------------------------------------------------------------------------------
// Ratios for luma in a gamma space, using BT.709 luma.
#define STP_LUMA_R 0.2126
#define STP_LUMA_G 0.7152
#define STP_LUMA_B 0.0722
// Expands to a comma separated {R, G, B} list.
#define STP_LUMA STP_LUMA_R, STP_LUMA_G, STP_LUMA_B
//------------------------------------------------------------------------------------------------------------------------------
// Maximum frames of feedback.
#define STP_FRAME_MAX 32.0
//------------------------------------------------------------------------------------------------------------------------------
// Control the min (motion match), and max (no motion match), in units of pixels.
// Settings of {max=1.0} won't work for 8x area scaling (trailing edge smears).
// Setting too tight won't have enough slop for motion matching (motion match easily fails, leading to loss of detail).
// If STP_PAT_MOT_MAX is too big, it will look like edges expand (or float) during change of motion.
#define STP_PAT_MOT_MIN (1.0 / 16.0)
#define STP_PAT_MOT_MAX (1.0 / 8.0)
// Computed constants: ADD is MIN squared, AMP is the reciprocal of (MAX squared minus ADD).
#define STP_PAT_MOT_ADD (STP_PAT_MOT_MIN * STP_PAT_MOT_MIN)
#define STP_PAT_MOT_AMP (1.0 / (STP_PAT_MOT_MAX * STP_PAT_MOT_MAX - STP_PAT_MOT_ADD))
//------------------------------------------------------------------------------------------------------------------------------
// Larger numbers ghost more, smaller numbers flicker more.
#define STP_PAT_DEMOIRE 64.0
// Increase for less ghosting, decrease for more ghosting.
#define STP_PAT_SENSITIVITY (2.0 / 16.0)
// Amount to scale up sensitivity on responsive. Lower numbers ghost more, higher flicker more.
#define STP_PAT_RESPONSIVE 16.0
// Minimum neighborhood (defaults to 1/32 of maximum value of neighborhood to allow some noise).
#define STP_PAT_NE_MIN (1.0 / 32.0)
//------------------------------------------------------------------------------------------------------------------------------
// {0} = default lowest dilation (higher chance of slight trailing ghost, but less overall flicker)
// {1} = expand a little (higher cost)
// {2} = expand by too much (a lot more cost, more flicker, perhaps less trailing ghost)
// In practice it's dilation and motion match threshold (PAT_MOT) which results in the final {flicker, ghost} tradeoff.
#define STP_SAFE_DILATE 1
//------------------------------------------------------------------------------------------------------------------------------
// Adjusts the point at which spatial-only weights blend up and anti-flicker fully takes over.
#define STP_TAA_SAA (1.0 / 2.0)
// De-weight pixel contribution for chopped corner.
#define STP_TAA_TRI_MASK_AVOID (1.0 / 8192.0)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                       JITTER LOCATIONS
//------------------------------------------------------------------------------------------------------------------------------
// STP is now using Halton(2,3).
//==============================================================================================================================
// Generate jitter amount given frame index.
//  p ....... Output, both components are written.
//  frame ... Frame index, currently ignored.
// This is a stub which always writes {0,0}.
STP_STATIC void StpJit(StpOutF2 p, StpU1 frame) {
    // TODO: This function isn't used inside Unity, if ever this is used the implementation should be added here.
    p[0] = StpF1_(0.0);
    p[1] = StpF1_(0.0); }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                      PARABOLIC {SIN,COS}
//==============================================================================================================================
#if defined(STP_GPU)
    // Parabolic sine approximation, evaluated in place as 'p * |p| - p'.
    // Input is {-1 to 1} representing {0 to 2 pi}, output is {-1/4 to 1/4} representing {-1 to 1}.
    void StpPSinF2(inout StpF2 p) {
        p = p * abs(p) - p; }
    // This is used to dither position of gather4 fetch for nearest motion vector to remove nearest artifacts when scaling.
    // Input 'p.x' is {0 to 1} representing {0 to 2 pi}, output is {-1/4 to 1/4} representing {-1 to 1}.
    // The incoming 'p.y' is ignored, it is overwritten with 'p.x' advanced by a quarter period (the cosine phase).
    void StpPSinCosF(inout StpF2 p) {
        p.y = StpFractF1(p.x + StpF1_(0.25));
        // Get from {0 to 1} to the {-1 to 1} input range of StpPSinF2().
        p = p * StpF2_(2.0) - StpF2_(1.0);
        StpPSinF2(p); }
//------------------------------------------------------------------------------------------------------------------------------
    // Same logic as StpPSinF2() and StpPSinCosF(), using the 'MF' types.
    void StpPSinMF2(inout StpMF2 p) {
        p = p * abs(p) - p; }
    void StpPSinCosMF(inout StpMF2 p) {
        p.y = StpFractMF1(p.x + StpMF1_(0.25));
        p = p * StpMF2_(2.0) - StpMF2_(1.0);
        StpPSinMF2(p); }
#endif // defined(STP_GPU)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_16BIT)
    // Same logic as StpPSinF2() and StpPSinCosF(), using the 'H' types.
    void StpPSinH2(inout StpH2 p) {
        p = p * abs(p) - p; }
    void StpPSinCosH(inout StpH2 p) {
        p.y = StpFractH1(p.x + StpH1_(0.25));
        p = p * StpH2_(2.0) - StpH2_(1.0);
        StpPSinH2(p); }
#endif // defined(STP_GPU) && defined(STP_16BIT)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                        DEPTH ENCODING
//------------------------------------------------------------------------------------------------------------------------------
// Using a log2() based encoding, takes {0 to inf} to {0 to 1}.
//   log2(k.x*z)*k.y
// Where
//   k.x = 1/near ............ (so that k.x*z is 1 when z=near)
//   k.y = 1/log2(k.x*far) ...
//                             (so that output is {0 to 1} ranged)
//------------------------------------------------------------------------------------------------------------------------------
// And the inverse
//   exp2(x*k.x)*k.y
// Where
//   k.x = log2(far/near)
//   k.y = near
//==============================================================================================================================
#if defined(STP_GPU)
    // Build the constants, based on near and far planes.
    // The 'far' is where anything more distant clamps to 1.0.
    // Returns {k.x = 1/near, k.y = 1/log2(far/near)} for use by StpZPack().
    StpF2 StpZCon(StpF1 near, StpF1 far) {
        StpF2 k;
        k.x = StpRcpF1(near);
        k.y = StpRcpF1(log2(k.x * far));
        return k; }
//------------------------------------------------------------------------------------------------------------------------------
    // Encode depth 'z' as 'saturate(log2(k.x * z) * k.y)'.
    // Where 'k' is generated by StpZCon().
    // The 'dit' is only read when STP_DITHER_DEPTH is 1, it shifts the result by '(dit - 0.5) / 1024'.
    // NOTE(review): 'dit' is presumably a {0 to 1} dither value, confirm against callers.
    // Only STP_DITHER_DEPTH values of {0,1} produce a return statement.
    StpF1 StpZPack(StpF1 z, StpF2 k, StpF1 dit) {
        #if (STP_DITHER_DEPTH == 0)
            return StpSatF1(log2(k.x * z) * k.y);
        #endif // (STP_DITHER_DEPTH == 0)
        #if (STP_DITHER_DEPTH == 1)
            // Fast linearly incorrect dither for 10-bit.
            return StpSatF1(log2(k.x * z) * k.y + dit * StpF1_(1.0 / 1024.0) - StpF1_(0.5 / 1024.0));
        #endif // (STP_DITHER_DEPTH == 1)
    }
//==============================================================================================================================
    // Build the constants, based on near and far planes.
    // The 'far' is where anything more distant clamps to 1.0.
    // Returns {k.x = log2(far/near), k.y = near} for use by StpZUnpack().
    StpF2 StpZUnCon(StpF1 near, StpF1 far) {
        StpF2 k;
        k.x = log2(far * StpRcpF1(near));
        k.y = near;
        return k; }
//------------------------------------------------------------------------------------------------------------------------------
    // Decode an encoded depth 'x' as 'exp2(x * k.x) * k.y'.
    // Where 'k' is generated by StpZUnCon().
    StpF1 StpZUnpack(StpF1 x, StpF2 k) {
        return exp2(x * k.x) * k.y; }
#endif // defined(STP_GPU)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                          STATIC GEOMETRY MOTION FORWARD PROJECTION
//==============================================================================================================================
// This is a separate section simply for documentation.
// This logic must be computed in 32-bit precision (in theory).
//------------------------------------------------------------------------------------------------------------------------------
// MOTION MATCH NOTES
// ==================
//  - The 'position - motion' is the reprojected position.
//  - Where {0 to 1} is no motion to a screen in motion.
//  - Motion check works with a differential vector '((motionPrior - motionCurrent) * kC)'.
//  - For static forward projection it will be '((motionPrior*0.5 - motionCurrent) * kC)'.
//     - Due to motionPrior being in {-1 to 1} NDC instead of {0 to 1} for screen.
//  - Working with motion vector differences to avoid complexity with jitter.
//------------------------------------------------------------------------------------------------------------------------------
// MOTION VECTOR NOTES
// ===================
//  - 'reprojection = position - motion'
//  - 'reprojection + motion = position'
//  - 'motion = position - reprojection'
//  - So motion points forward.
1088//------------------------------------------------------------------------------------------------------------------------------ 1089// FORWARD PROJECTION LOGIC 1090// ======================== 1091// HAVE INPUT {0 TO 1} SCREEN POSITION 1092// xy 1093// GET XY INTO {-1 TO 1} NDC [2 FMA, CANNOT FACTOR, NEED AT END] 1094// x=x*2-1 1095// y=y*2-1 1096// HAVE INPUT {0 TO INF} DEPTH 1097// z 1098// GET FROM {XY NDC, DEPTH} TO 3D VIEW POSITION [4 FMA] 1099// xx=x*((z*g+h)/a) ... xx=x*(z*(g/a)+(h/a)) ... xx=x*(z*k0+k1) 1100// yy=y*((z*g+h)/b) ... yy=y*(z*(g/b)+(h/b)) ... yy=y*(z*k2+k3) 1101// TRANSFORM TO NEW VIEW 1102// xxx=xx*i+yy*j+z*k+l 1103// yyy=xx*m+yy*n+z*o+p 1104// zzz=xx*q+yy*r+z*s+t 1105// PROJECTION [9 FMA] 1106// xxxx=xxx*a ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) ..... xxxx=xx*k4+yy*k5+z*k6+k7 1107// yyyy=yyy*b ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) ..... yyyy=xx*k8+yy*k9+z*kA+kB 1108// wwww=zzz*g+h ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h) ... wwww=xx*kC+yy*kD+z*kE+kF 1109// PERSPECTIVE DIVIDE [1 RCP] 1110// xxxxx=xxxx/wwww 1111// yyyyy=yyyy/wwww 1112// SUBTRACT TO GET 2X MOTION [2 FMA] 1113// u=xxxxx-x ... u=xxxx*(1/wwww)-x 1114// v=yyyyy-y ... v=yyyy*(1/wwww)-y 1115// CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES) 1116// k0=g/a ... Constants {a,b,c,d,g,h} for prior projection 1117// k1=h/a 1118// k2=g/b 1119// k3=h/b 1120// k4=i*a ... Constants {a,b,c,d,g,h} for next projection 1121// k5=j*a 1122// k6=k*a 1123// k7=l*a 1124// k8=m*b 1125// k9=n*b 1126// kA=o*b 1127// kB=p*b 1128// kC=q*g 1129// kD=r*g 1130// kE=s*g 1131// kF=t*g+h 1132//------------------------------------------------------------------------------------------------------------------------------ 1133// BACKWARD PROJECTION LOGIC 1134// ========================= 1135// This starts from '3D VIEW POSITION' of 'FORWARD PROJECTION LOGIC', but with different constants. 
1136// TRANSFORM TO NEW VIEW 1137// xxx=xx*i+yy*j+z*k+l 1138// yyy=xx*m+yy*n+z*o+p 1139// zzz=xx*q+yy*r+z*s+t 1140// PROJECTION [9 FMA] 1141// xxxx=xxx*a ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) ..... xxxx=xx*kG+yy*kH+z*kI+kJ 1142// yyyy=yyy*b ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) ..... yyyy=xx*kK+yy*kL+z*kM+kN 1143// wwww=zzz*g+h ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h) ... wwww=xx*kO+yy*kP+z*kQ+kR 1144// PERSPECTIVE DIVIDE [1 RCP] 1145// xxxxx=xxxx/wwww 1146// yyyyy=yyyy/wwww 1147// SUBTRACT TO GET 2X MOTION [2 FMA] 1148// u=xxxxx-x ... u=xxxx*(1/wwww)-x 1149// v=yyyyy-y ... v=yyyy*(1/wwww)-y 1150// CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES) 1151// kG=i*a ... Constants {a,b,c,d,g,h} for previous prior projection, and {i,j,k,l,m,n,o,p,q,r,s,t} for prior back projection 1152// kH=j*a 1153// kI=k*a 1154// kJ=l*a 1155// kK=m*b 1156// kL=n*b 1157// kM=o*b 1158// kN=p*b 1159// kO=q*g 1160// kP=r*g 1161// kQ=s*g 1162// kR=t*g+h 1163//============================================================================================================================== 1164// GET FROM {0 TO 1} TO {-1 TO 1} 1165// ============================== 1166// - Get to NDC for {x,y} 1167// X:=x*2-1 1168// Y:=y*2-1 1169//------------------------------------------------------------------------------------------------------------------------------ 1170// FORWARD VIEW 1171// ============ 1172// - Using 12 values 1173// X:=x*i+y*j+z*k+l 1174// Y:=x*m+y*n+z*o+p 1175// Z:=x*q+y*r+z*s+t 1176// W:=1 1177// i j k l 1178// m n o p 1179// q r s t 1180// 0 0 0 1 1181//------------------------------------------------------------------------------------------------------------------------------ 1182// PROJECTIONS 1183// =========== 1184// - INPUTS 1185// n ... near plane z 1186// f ... far plane z 1187// - DX ORTHO PROJECTION 1188// c:=1/(f-n) 1189// d:=-n/(f-n) 1190// X:=x*a 1191// Y:=y*b 1192// Z:=z*c+d ... 
(w=1 on input) 1193// W:=1 1194// a 0 0 0 1195// 0 b 0 0 1196// 0 0 c d 1197// 0 0 0 1 1198// - DX PERSPECTIVE PROJECTION (LEFT HANDED) 1199// c:=f/(f-n) 1200// d:=-(f*n)/(f-n) 1201// X:=x*a 1202// Y:=y*b 1203// Z:=z*c+d ... (w=1 on input) 1204// W:=z 1205// a 0 0 0 1206// 0 b 0 0 1207// 0 0 c d 1208// 0 0 1 0 ... (note DX allows the 1 to be non-one) 1209// - DX PERSPECTIVE PROJECTION REVERSED FOR BETTER PRECISION (LEFT HANDED) 1210// c:=-n/(f-n) 1211// d:=(f*n)/(f-n) 1212// X:=x*a 1213// Y:=y*b 1214// Z:=z*c+d ... (w=1 on input) 1215// W:=z 1216// a 0 0 0 1217// 0 b 0 0 1218// 0 0 c d 1219// 0 0 1 0 1220// - DX PERSPECTIVE PROJECTION REVERSED WITH INF FAR (LEFT HANDED) 1221// X:=x*a 1222// Y:=y*b 1223// Z:=n ... (w=1 on input) 1224// W:=z 1225// a 0 0 0 1226// 0 b 0 0 1227// 0 0 0 n 1228// 0 0 1 0 1229// - GL PERSPECTIVE PROJECTION 1230// c:=-(f+n)/(f-n) 1231// d:=-(2fn)/(f-n) 1232// X:=x*a 1233// Y:=y*b 1234// Z:=z*c+d ... (w=1 on input) 1235// W:=z 1236// a 0 0 0 1237// 0 b 0 0 1238// 0 0 c d 1239// 0 0 -1 0 1240// - GENERALIZED (WILL DO ANYTHING) 1241// X:=x*a 1242// Y:=y*b 1243// Z:=z*c+d ... (w=1 on input) 1244// W:=z*g+h 1245// a 0 0 0 1246// 0 b 0 0 1247// 0 0 c d 1248// 0 0 g h 1249//------------------------------------------------------------------------------------------------------------------------------ 1250// PROJECTED TO NDC 1251// ================ 1252// - Ignoring viewport transform 1253// X:=x/w 1254// Y:=y/w 1255// Z:=z/w 1256// W:=1/w 1257// - Inverse 1258// x=X*w 1259// y=Y*w 1260//============================================================================================================================== 1261// MODIFICATIONS FOR COMPLEX PROJECTIONS 1262//------------------------------------------------------------------------------------------------------------------------------ 1263// Since this worked out to just 2 more FMAs and 2 more constants, decided not to create a shader permutation. 
1264//============================================================================================================================== 1265// COMPLEX PROJECTION 1266// ================== 1267// - GL PERSPECTIVE PROJECTION - WITH Z BASED {X,Y} MODIFICATIONS 1268// c:=-(F+N)/(F-N) 1269// d:=-(2FN)/(F-N) 1270// X:=x*a + z*e 1271// Y:=y*b + z*f 1272// Z:=z*c+d ... (w=1 on input) 1273// W:=z 1274// a 0 e 0 1275// 0 b f 0 1276// 0 0 c d 1277// 0 0 -1 0 1278// - GENERALIZED (WILL DO ANYTHING) - WITH Z BASED {X,Y} MODIFICATIONS 1279// X:=x*a + z*e 1280// Y:=y*b + z*f 1281// Z:=z*c+d ... (w=1 on input) 1282// W:=z*g+h 1283// a 0 e 0 1284// 0 b f 0 1285// 0 0 c d 1286// 0 0 g h 1287// - INVERSE GIVEN 'z' 1288// X:=x*a + z*e 1289// Y:=y*b + z*f 1290// X - z*e:=x*a 1291// Y - z*f:=y*b 1292// X/a - z*e/a:=x 1293// Y/b - z*f/b:=y 1294//------------------------------------------------------------------------------------------------------------------------------ 1295// FORWARD PROJECTION LOGIC 1296// ======================== 1297// HAVE INPUT {0 TO 1} SCREEN POSITION 1298// xy 1299// GET XY INTO {-1 TO 1} NDC [2 FMA, CANNOT FACTOR, NEED AT END] 1300// x=x*2-1 1301// y=y*2-1 1302// HAVE INPUT {0 TO INF} DEPTH 1303// z 1304// GET FROM {XY NDC, DEPTH} TO 3D VIEW POSITION [4 FMA] 1305// ... have {X,Y,z} 1306// ... xx=(x*(z*g+h))*(1/a) + z*(e/a) 1307// ... yy=(y*(z*g+h))*(1/b) + z*(f/b) 1308// ... xx=x*((z*g+h)/a) + z*(e/a) 1309// ... yy=y*((z*g+h)/b) + z*(f/b) 1310// ... xx=x*(z*(g/a)+(h/a)) + z*(e/a) 1311// ... yy=y*(z*(g/b)+(h/b)) + z*(f/b) 1312// xx=x*(z*k0+k1)+z*k2 1313// yy=y*(z*k3+k4)+z*k5 1314// TRANSFORM TO NEW VIEW 1315// xxx=xx*i+yy*j+z*k+l 1316// yyy=xx*m+yy*n+z*o+p 1317// zzz=xx*q+yy*r+z*s+t 1318// PROJECTION [9 FMA] 1319// xxxx=xxx*a+zzz*e 1320// ... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) + xx*(q*e)+yy*(r*e)+z*(s*e)+(t*e) 1321// ... xxxx=xx*k6+yy*k7+z*k8+k9 1322// yyyy=yyy*b+zzz*f 1323// ... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) + xx*(q*f)+yy*(r*f)+z*(s*f)+(t*f) 1324// ... 
yyyy=xx*kA+yy*kB+z*kC+kD 1325// wwww=zzz*g+h 1326// ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h) 1327// ... wwww=xx*kE+yy*kF+z*kG+kH 1328// PERSPECTIVE DIVIDE [1 RCP] 1329// xxxxx=xxxx/wwww 1330// yyyyy=yyyy/wwww 1331// SUBTRACT TO GET 2X MOTION [2 FMA] 1332// u=xxxxx-x ... u=xxxx*(1/wwww)-x 1333// v=yyyyy-y ... v=yyyy*(1/wwww)-y 1334// CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES) 1335// k0=g/a ... Constants {a,b,c,d,e,f,g,h} for prior projection 1336// k1=h/a 1337// k2=e/a 1338// k3=g/b 1339// k4=h/b 1340// k5=f/b 1341// k6=(i*a)+(q*e) ... Constants {a,b,c,d,e,f,g,h} for next projection 1342// k7=(j*a)+(r*e) 1343// k8=(k*a)+(s*e) 1344// k9=(l*a)+(t*e) 1345// kA=(m*b)+(q*f) 1346// kB=(n*b)+(r*f) 1347// kC=(o*b)+(s*f) 1348// kD=(p*b)+(t*f) 1349// kE=q*g 1350// kF=r*g 1351// kG=s*g 1352// kH=t*g+h 1353//------------------------------------------------------------------------------------------------------------------------------ 1354// BACKWARD PROJECTION LOGIC 1355// ========================= 1356// This starts from '3D VIEW POSITION' of 'FORWARD PROJECTION LOGIC', but with different constants. 1357// TRANSFORM TO NEW VIEW 1358// xxx=xx*i+yy*j+z*k+l 1359// yyy=xx*m+yy*n+z*o+p 1360// zzz=xx*q+yy*r+z*s+t 1361// PROJECTION [9 FMA] 1362// xxxx=xxx*a+zzz*e 1363// ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) + xx*(q*e)+yy*(r*e)+z*(s*e)+(t*e) 1364// ..... xxxx=xx*kI+yy*kJ+z*kK+kL 1365// yyyy=yyy*b+zzz*f 1366// ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) + xx*(q*f)+yy*(r*f)+z*(s*f)+(t*f) 1367// ..... yyyy=xx*kM+yy*kN+z*kO+kP 1368// wwww=zzz*g+h 1369// ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h) 1370// ... wwww=xx*kQ+yy*kR+z*kS+kT 1371// PERSPECTIVE DIVIDE [1 RCP] 1372// xxxxx=xxxx/wwww 1373// yyyyy=yyyy/wwww 1374// SUBTRACT TO GET 2X MOTION [2 FMA] 1375// u=xxxxx-x ... u=xxxx*(1/wwww)-x 1376// v=yyyyy-y ... v=yyyy*(1/wwww)-y 1377// CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES) 1378// ... Constants {a,b,c,d,e,f,g,h} for previous prior projection 1379// ... 
//         Constants {i,j,k,l,m,n,o,p,q,r,s,t} for prior back projection
//   kI=(i*a)+(q*e)
//   kJ=(j*a)+(r*e)
//   kK=(k*a)+(s*e)
//   kL=(l*a)+(t*e)
//   kM=(m*b)+(q*f)
//   kN=(n*b)+(r*f)
//   kO=(o*b)+(s*f)
//   kP=(p*b)+(t*f)
//   kQ=q*g
//   kR=r*g
//   kS=s*g
//   kT=t*g+h
//==============================================================================================================================
#if defined(STP_GPU)
    // Generates forward {-1 to 1} NDC forward projection vectors given (from prior frame),
    //  p .... {0 to 1} screen position
    //  z .... {0 to INF} depth
    //  m .... {0 to 1} prior motion vector
    // The results are approximately corrected for dynamic motion.
    // This takes 'dynamicMotion = priorMotionVector - priorStaticGeometryBackprojection'
    // Then adds that estimate of dynamic motion to the static geometry forward projection vector.
    // Remaining arguments,
    //  kMotionMatch ... Scale applied to the dynamic estimate before it is output and summed.
    //  k0123..kST ..... The constants {k0 to kT} of the complex projection notes, packed in order,
    //                   {k0 to k5} for view position, {k6 to kH} for forward, {kI to kT} for backward.
    //  bugF ........... Output, static forward estimate.
    //  bugD ........... Output, dynamic estimate (already scaled by 'kMotionMatch').
    // Returns 'bugF + bugD'.
    StpF2 StpFor(StpF2 p, StpF1 z, StpF2 m, StpF1 kMotionMatch,
    StpF4 k0123, StpF4 k4567, StpF4 k89AB, StpF4 kCDEF, StpF4 kGHIJ, StpF4 kKLMN, StpF4 kOPQR, StpF2 kST,
    out StpF2 bugF, out StpF2 bugD){
        // Implements the logic described above in the comments.
        // Get from {0 to 1} screen position to {-1 to 1} NDC.
        p = p * StpF2_(2.0) - StpF2_(1.0);
        // Get to 3D view position {q.x, q.y, z} using {k0 to k5}.
        StpF2 q;
        q.x = p.x * (z * k0123.x + k0123.y) + (z * k0123.z);
        q.y = p.y * (z * k0123.w + k4567.x) + (z * k4567.y);
        // Forward transform and projection using {k6 to kH}, then 'v.z' is replaced with '1/w'.
        StpF3 v;
        v.x = q.x * k4567.z + q.y * k4567.w + z * k89AB.x + k89AB.y;
        v.y = q.x * k89AB.z + q.y * k89AB.w + z * kCDEF.x + kCDEF.y;
        v.z = q.x * kCDEF.z + q.y * kCDEF.w + z * kGHIJ.x + kGHIJ.y;
        v.z = StpRcpF1(v.z);
        // Backward transform and projection using {kI to kT}, then 'v2.z' is replaced with '1/w'.
        StpF3 v2;
        v2.x = q.x * kGHIJ.z + q.y * kGHIJ.w + z * kKLMN.x + kKLMN.y;
        v2.y = q.x * kKLMN.z + q.y * kKLMN.w + z * kOPQR.x + kOPQR.y;
        v2.z = q.x * kOPQR.z + q.y * kOPQR.w + z * kST.x + kST.y;
        v2.z = StpRcpF1(v2.z);
        // Motion vector points forward (to estimated position in next frame).
        // Negative motion vector points back to where the pixel was in the prior frame.
        // Motion vector is {0 to 1} for one screen, but this logic is {-1 to 1} based (hence a 2x scaling).
        bugF = (v.xy * StpF2_(v.z) - p); // Static forward estimate.
        bugD = ((StpF2_(2.0) * m) - (p - v2.xy * StpF2_(v2.z))) * StpF2_(kMotionMatch); // Dynamic estimate.
        return bugF + bugD; }
#endif // defined(STP_GPU)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                    MOTION VECTOR ENCODING
//------------------------------------------------------------------------------------------------------------------------------
// {MSB 10-bit depth, LSB {11,11}-bit motion with sqrt() encoding}
// Motion is encoded in sqrt() space.
//------------------------------------------------------------------------------------------------------------------------------
//  11111111111111110000000000000000
//  fedcba9876543210fedcba9876543210
//  ================================
//  zzzzzzzzzz...................... 10-bit encoded z
//  ..........yyyyyyyyyyy........... 11-bit {-1 to <1} y encoded in gamma 2.0 (sqrt)
//  .....................xxxxxxxxxxx 11-bit {-1 to <1} x encoded in gamma 2.0 (sqrt)
//------------------------------------------------------------------------------------------------------------------------------
// The 32-bit path is 8 ops to decode {x,y}.
//------------------------------------------------------------------------------------------------------------------------------
// There once was a 16-bit path which takes 6 ops to decode (bit extra because ABS isn't free).
//  hhhhhhhhhhhhhhhhllllllllllllllll
//  ================================
//  zzzzzzzzzzyyyyyyyyyyyxxxxxxxxxxx input
//  zzzzzyyyyyyyyyyyxxxxxxxxxxx00000 << 5
//  00000yyyyyyyyyyyxxxxxxxxxxx00000 & 0x7FFFFFF
//  00000yyyyyyyyyyy00000xxxxxxxxxxx >> 5 (for 16-bit LSB only)
// This gets 11-bit integers which perfectly alias lowest non-denormal and denormals of FP16.
// Can scale by '16384' and subtract 1 to decompress without a CVT.
//==============================================================================================================================
#if defined(STP_GPU)
    // Packs depth and motion into {10-bit z, 11-bit y, 11-bit x} from MSB to LSB.
    // The 'z' comes in {0 to 1}.
    // This depends on 'v' ranging inside and including {-1 to 1}.
    // The 'dit' is only read when STP_DITHER_MOTION is non-zero, it shifts the sqrt() encoded magnitude by
    // '(dit - 0.5) / 1024' prior to saturation.
    StpU1 StpMvPack(StpF1 z, StpF2 v, StpF1 dit) {
        // {-1 to 1} linear to gamma 2.0 {-1 to 1}
        #if STP_DITHER_MOTION
            v = StpCpySgnF2(StpSatF2(sqrt(abs(v)) + StpF2_(dit * StpF1_(1.0 / 1024.0) - StpF1_(0.5 / 1024.0))), v);
        #else
            v = StpCpySgnF2(sqrt(abs(v)), v);
        #endif
        // Limit to {-1024/1024 to 1023/1024}.
        v = min(v, StpF2_(1023.0/1024.0));
        // Encode to 11-bit with zero at center of one step.
        v = v * StpF2_(1024.0) + StpF2_(1024.0);
        // Pack.
        return (StpU1(z * StpF1_(1023.0)) << StpU1(22)) + (StpU1(v.y) << StpU1(11)) + StpU1(v.x); }
//------------------------------------------------------------------------------------------------------------------------------
    // Unpack just velocity.
    //  v ... Output, each 11-bit field is scaled by 1/1024 and biased by -1, then the sqrt() encoding is undone
    //        with 'v * |v|' (square which keeps the sign).
    //  i ... Value packed by StpMvPack().
    void StpMvUnpackV(out StpF2 v, StpU1 i) {
        StpU1 iy = StpBfeU1(i, 11u, 11u);
        StpU1 ix = StpBfeU1(i, 0u, 11u);
        v.y = StpF1(iy) * StpF1_(1.0 / 1024.0) + StpF1_(-1.0);
        v.x = StpF1(ix) * StpF1_(1.0 / 1024.0) + StpF1_(-1.0);
        v *= abs(v); }
//------------------------------------------------------------------------------------------------------------------------------
    // Unpacks all.
    //  z ... Output, the 10-bit field scaled back by 1/1023.
    //  v ... Output, decoded motion (see StpMvUnpackV()).
    //  i ... Value packed by StpMvPack().
    void StpMvUnpack(out StpF1 z, out StpF2 v, StpU1 i) {
        StpU1 iz = StpBfeU1(i, 22u, 10u);
        z = StpF1(iz) * StpF1_(1.0 / 1023.0);
        // Motion decode is shared with the velocity only path.
        StpMvUnpackV(v, i); }
#endif // defined(STP_GPU)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                       COLOR CONVERSION
//==============================================================================================================================
#if defined(STP_GPU)
    // Scaling in the reversible tonemapper (should be >= 1).
    // Getting too close to 1.0 will result in luma inversions in highly saturated content in the oldest algorithm.
    // Using 4.0 or ideally 8.0 is recommended.
// STP_SAT is the scale constant read by the StpTone*() and StpToneInv*() functions.
    #define STP_SAT 4.0
#endif // defined(STP_GPU)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_32BIT)
    // Scalar tonemapper, 'x = saturate(x / (STP_SAT + x))'.
    void StpToneF1(inout StpF1 x) {
        StpF1 y = StpRcpF1(StpF1_(STP_SAT) + x);
        x = StpSatF1(x * StpF1_(y)); }
//------------------------------------------------------------------------------------------------------------------------------
    // Reversible tonemapper.
    // All channels are scaled by '1 / (STP_SAT + max(r, g, b))' then saturated.
    void StpToneF3(inout StpF3 x) {
        StpF1 y = StpRcpF1(StpF1_(STP_SAT) + StpMax3F1(x.r, x.g, x.b));
        x = StpSatF3(x * StpF3_(y)); }
//------------------------------------------------------------------------------------------------------------------------------
    // Inverse of StpToneF3().
    // All channels are scaled by '1 / max(1/16384, saturate((1 - max(r, g, b)) / STP_SAT))'.
    void StpToneInvF3(inout StpF3 x) {
        StpF1 y = StpRcpF1(
            // The '1.0 / 16384.0' floor bounds the scale factor.
            // Using 32768.0 causes problems in Unity with bloom on at least some platforms.
            // So output maximum is 16384 for StpToneInvF3().
            max(StpF1_(1.0 / 16384.0), StpSatF1(StpF1_(1.0 / STP_SAT) - StpMax3F1(x.r, x.g, x.b) * StpF1_(1.0 / STP_SAT))));
        x *= StpF3_(y); }
//------------------------------------------------------------------------------------------------------------------------------
    // This is currently unused but left in for reference.
    // Convert LDR RGB to Gamma 2.0 RGB {0 to 1}.
    // This is for storage to 8-bit.
    // This is temporal dithered.
    // Unoptimized logic (for reference).
    //  StpF3 n = sqrt(c);
    //  n = floor(n * StpF3_(255.0)) * StpF3_(1.0 / 255.0);
    //  StpF3 a = n * n;
    //  StpF3 b = n + StpF3_(1.0 / 255.0); b = b * b;
    //  // Ratio of 'a' to 'b' required to produce 'c'.
    //  StpF3 r = (c - b) * StpRcpF3(a - b);
    //  // Use the ratio as a cutoff to choose 'a' or 'b'.
    //  c = StpSatF3(n + StpGtZeroF3(StpF3_(dit) - r) * StpF3_(1.0 / 255.0));
    // Optimized from 57 to 42 clks on GCN.
    StpF3 StpRgbGamDit8F3(StpF3 c, StpF1 dit) {
        // Lower 8-bit step in gamma 2.0 space.
        StpF3 n = sqrt(c);
        n = floor(n * StpF3_(255.0)) * StpF3_(1.0 / 255.0);
        // Linear value of the lower step, and gamma 2.0 value of the upper step.
        StpF3 a = n * n;
        StpF3 b = n + StpF3_(1.0 / 255.0);
        // Add one step where the dither test is greater than zero.
        c = StpSatF3(n + StpGtZeroF3(StpF3_(dit) * (b * b - a) - (b * b - c)) * StpF3_(1.0 / 255.0));
        return c; }
//------------------------------------------------------------------------------------------------------------------------------
    // This is currently unused but left in for reference.
    // Version for 10-bit for feedback.
    StpF3 StpRgbGamDit10F3(StpF3 c, StpF1 dit) {
        StpF3 n = sqrt(c);
        n = floor(n * StpF3_(1023.0)) * StpF3_(1.0 / 1023.0);
        StpF3 a = n * n;
        StpF3 b = n + StpF3_(1.0 / 1023.0);
        c = StpSatF3(n + StpGtZeroF3(StpF3_(dit) * (b * b - a) - (b * b - c)) * StpF3_(1.0 / 1023.0));
        return c; }
//------------------------------------------------------------------------------------------------------------------------------
    // Can use this function to convert feedback back to color.
// StpFeed2ClrF() squares 'c' in place and, when STP_POSTMAP is 0, then applies StpToneInvF3().
    void StpFeed2ClrF(inout StpF3 c) {
        c *= c;
        #if (STP_POSTMAP == 0)
            StpToneInvF3(c.rgb);
        #endif
    }
#endif // defined(STP_GPU) && defined(STP_32BIT)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_32BIT)
    // The 'MF' functions repeat the logic of the 'F' functions using the 'MF' types.
    // Scalar tonemapper, 'x = saturate(x / (STP_SAT + x))'.
    void StpToneMF1(inout StpMF1 x) {
        StpMF1 y = StpRcpMF1(StpMF1_(STP_SAT) + x);
        x = StpSatMF1(x * StpMF1_(y)); }
//------------------------------------------------------------------------------------------------------------------------------
    // Reversible tonemapper, scales by '1 / (STP_SAT + max(r, g, b))' then saturates.
    void StpToneMF3(inout StpMF3 x) {
        StpMF1 y = StpRcpMF1(StpMF1_(STP_SAT) + StpMax3MF1(x.r, x.g, x.b));
        x = StpSatMF3(x * StpMF3_(y)); }
//------------------------------------------------------------------------------------------------------------------------------
    // Inverse of StpToneMF3(), the '1.0 / 16384.0' floor bounds the scale factor.
    void StpToneInvMF3(inout StpMF3 x) {
        StpMF1 y = StpRcpMF1(
            max(StpMF1_(1.0 / 16384.0), StpSatMF1(StpMF1_(1.0 / STP_SAT) -
            StpMax3MF1(x.r, x.g, x.b) * StpMF1_(1.0 / STP_SAT))));
        x *= StpMF3_(y); }
//------------------------------------------------------------------------------------------------------------------------------
    // Dithered conversion to gamma 2.0 in 1/255 steps.
    StpMF3 StpRgbGamDit8MF3(StpMF3 c, StpMF1 dit) {
        StpMF3 n = sqrt(c);
        n = floor(n * StpMF3_(255.0)) * StpMF3_(1.0 / 255.0);
        StpMF3 a = n * n;
        StpMF3 b = n + StpMF3_(1.0 / 255.0);
        c = StpSatMF3(n + StpGtZeroMF3(StpMF3_(dit) * (b * b - a) - (b * b - c)) * StpMF3_(1.0 / 255.0));
        return c; }
//------------------------------------------------------------------------------------------------------------------------------
    // Dithered conversion to gamma 2.0 in 1/1023 steps.
    StpMF3 StpRgbGamDit10MF3(StpMF3 c, StpMF1 dit) {
        StpMF3 n = sqrt(c);
        n = floor(n * StpMF3_(1023.0)) * StpMF3_(1.0 / 1023.0);
        StpMF3 a = n * n;
        StpMF3 b = n + StpMF3_(1.0 / 1023.0);
        c = StpSatMF3(n + StpGtZeroMF3(StpMF3_(dit) * (b * b - a) - (b * b - c)) * StpMF3_(1.0 / 1023.0));
        return c; }
//------------------------------------------------------------------------------------------------------------------------------
    // Squares 'c' in place and, when STP_POSTMAP is 0, then applies StpToneInvMF3().
    void StpFeed2ClrMF(inout StpMF3 c) {
        c *= c;
        #if (STP_POSTMAP == 0)
            StpToneInvMF3(c.rgb);
        #endif
    }
#endif // defined(STP_GPU) && defined(STP_32BIT)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_16BIT)
    // The 'H' functions repeat the logic of the 'F' functions using the 'H' types.
    // Scalar tonemapper, 'x = saturate(x / (STP_SAT + x))'.
    void StpToneH1(inout StpH1 x) {
        StpH1 y = StpRcpH1(StpH1_(STP_SAT) + x);
        x = StpSatH1(x * StpH1_(y)); }
//------------------------------------------------------------------------------------------------------------------------------
    // Reversible tonemapper, scales by '1 / (STP_SAT + max(r, g, b))' then saturates.
    void StpToneH3(inout StpH3 x) {
        StpH1 y = StpRcpH1(StpH1_(STP_SAT) + StpMax3H1(x.r, x.g, x.b));
        x = StpSatH3(x * StpH3_(y)); }
//------------------------------------------------------------------------------------------------------------------------------
    // Inverse of StpToneH3(), the '1.0 / 16384.0' floor bounds the scale factor.
    void StpToneInvH3(inout StpH3 x) {
        StpH1 y = StpRcpH1(
            max(StpH1_(1.0 / 16384.0), StpSatH1(StpH1_(1.0 / STP_SAT) - StpMax3H1(x.r, x.g, x.b) * StpH1_(1.0 / STP_SAT))));
        x *= StpH3_(y); }
//------------------------------------------------------------------------------------------------------------------------------
    // Dithered conversion to gamma 2.0 in 1/255 steps.
    StpH3 StpRgbGamDit8H3(StpH3 c, StpH1 dit) {
        StpH3 n = sqrt(c);
        n = floor(n * StpH3_(255.0)) * StpH3_(1.0 / 255.0);
        StpH3 a = n * n;
        StpH3 b = n + StpH3_(1.0 / 255.0);
        c = StpSatH3(n + StpGtZeroH3(StpH3_(dit) * (b * b - a) - (b * b - c)) * StpH3_(1.0 / 255.0));
        return c; }
//------------------------------------------------------------------------------------------------------------------------------
    // Dithered conversion to gamma 2.0 in 1/1023 steps.
    StpH3 StpRgbGamDit10H3(StpH3 c, StpH1 dit) {
        StpH3 n = sqrt(c);
        n = floor(n * StpH3_(1023.0)) * StpH3_(1.0 / 1023.0);
        StpH3 a = n * n;
        StpH3 b = n + StpH3_(1.0 / 1023.0);
        c = StpSatH3(n + StpGtZeroH3(StpH3_(dit) * (b * b - a) - (b * b - c)) * StpH3_(1.0 / 1023.0));
        return c; }
//------------------------------------------------------------------------------------------------------------------------------
    // Squares 'c' in place and, when STP_POSTMAP is 0, then applies StpToneInvH3().
    void StpFeed2ClrH(inout StpH3 c) {
        c *= c;
        #if (STP_POSTMAP == 0)
            StpToneInvH3(c.rgb);
        #endif
    }
#endif // defined(STP_GPU) && defined(STP_16BIT)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                    COLOR CONVERSION TOOLS
//------------------------------------------------------------------------------------------------------------------------------
// Some platforms do not have a hardware sRGB image store (requires manual conversion).
//==============================================================================================================================
// Linear to sRGB encode, per channel:
//   linear segment := c * 12.92
//   power segment  := 1.055 * c^(1/2.4) - 0.055
// The result is the median of {0.0031308 * 12.92, linear segment, power segment}. Below the knee both segments are under the
// threshold and the larger one (linear) is the median; above the knee both are over it and the smaller one (power) is.
// The median is spelled out with min/max instead of 'clamp(threshold, linear, power)': that clamp call has min > max on
// both sides of the knee, which only produces the median on hardware where clamp behaves as a median-of-3.
#if defined(STP_GPU) && defined(STP_32BIT)
    StpF3 StpLinearToSrgbF3(StpF3 c) {
        StpF3 j = StpF3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); StpF2 k = StpF2(1.055, -0.055);
        StpF3 lin = c * j.yyy;
        StpF3 pwr = pow(c, j.zzz) * k.xxx + k.yyy;
        // med3(a, b, c) := max(min(a, b), min(max(a, b), c)).
        return max(min(j.xxx, lin), min(max(j.xxx, lin), pwr)); }
//------------------------------------------------------------------------------------------------------------------------------
    StpMF3 StpLinearToSrgbMF3(StpMF3 c) {
        StpMF3 j = StpMF3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); StpMF2 k = StpMF2(1.055, -0.055);
        StpMF3 lin = c * j.yyy;
        StpMF3 pwr = pow(c, j.zzz) * k.xxx + k.yyy;
        // med3(a, b, c) := max(min(a, b), min(max(a, b), c)).
        return max(min(j.xxx, lin), min(max(j.xxx, lin), pwr)); }
#endif // defined(STP_GPU) && defined(STP_32BIT)
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_16BIT)
    StpH3 StpLinearToSrgbH3(StpH3 c) {
        StpH3 j = StpH3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); StpH2 k = StpH2(1.055, -0.055);
        StpH3 lin = c * j.yyy;
        StpH3 pwr = pow(c, j.zzz) * k.xxx + k.yyy;
        // med3(a, b, c) := max(min(a, b), min(max(a, b), c)).
        return max(min(j.xxx, lin), min(max(j.xxx, lin), pwr)); }
#endif // defined(STP_GPU) && defined(STP_16BIT)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                        DEBUG COMMON
//==============================================================================================================================
#if defined(STP_GPU) && STP_BUG
    // Debug output callback. Only declared here; callers in this file pass {pixel.xy, view index} and a color.
    void StpBugF(StpU3 p, StpF4 c);
#endif // defined(STP_GPU) && STP_BUG
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                     CONSTANT GENERATION
//==============================================================================================================================
// Fills one constant vector from the current image size: {4/width, 4/height} as float bits and {width>>2, height>>2} as uint.
STP_STATIC void StpDilCon(
// Generated constants.
StpInOutU4 con0,
// Current image resolution in pixels.
StpInF2 imgC) {
    // StpF2 kRcpR := 4/size of current input image in pixels.
    con0[0] = StpU1_F1(StpF1_(4.0) / imgC[0]);
    con0[1] = StpU1_F1(StpF1_(4.0) / imgC[1]);
    // StpU2 kR := size/4 of the current input image in pixels.
    // Used for pass merging (DIL and SAA), since convergence is 1/16 area of input, must check position.
    con0[2] = StpU1_(StpU1_(imgC[0]) >> StpU1_(2));
    con0[3] = StpU1_(StpU1_(imgC[1]) >> StpU1_(2)); }
//==============================================================================================================================
// Fills the constant vectors con0..conC for the pattern pass. All float values are stored as their bit patterns (StpU1_F1).
// NOTE(review): kST is written to con5[2..3] here, while StpPatF() in this file reads kST from 'conD.xy' and this function
// has no 'conD' output. Confirm which layout the host side actually uses before relying on this function.
STP_STATIC void StpPatCon(
// Generated constants.
StpInOutU4 con0,
StpInOutU4 con1,
StpInOutU4 con2,
StpInOutU4 con3,
StpInOutU4 con4,
StpInOutU4 con5,
StpInOutU4 con6,
StpInOutU4 con7,
StpInOutU4 con8,
StpInOutU4 con9,
StpInOutU4 conA,
StpInOutU4 conB,
StpInOutU4 conC,
// Linear depth near plane for log2 depth encoding.
StpF1 near,
// Linear depth far plane for log2 depth encoding.
StpF1 far,
// Frame count for current frame (sets jitter).
StpU1 frame,
// Current image resolution in pixels.
StpInF2 imgC,
// Prior image resolution in pixels.
StpInF2 imgP,
// Feedback (aka output) resolution in pixels.
StpInF2 imgF,
// Ratio of 'currentFrameTime/priorFrameTime'.
StpF1 motionMatch,
// Projection matrix data {a,b,c,d,e,f,g,h}.
// This is used to do static geometry forward projection.
//  a 0 e 0
//  0 b f 0
//  0 0 c d
//  0 0 g h
// For reference, a DX ortho projection would be,
//  a 0 e 0
//  0 b f 0
//  0 0 c d
//  0 0 0 1
// And a DX, left handed perspective projection would be,
//  a 0 e 0
//  0 b f 0
//  0 0 c d ... c := F/(F-N), d := -(F*N)/(F-N)
//  0 0 1 0
// Previous prior projection.
StpInF4 prjPrvABEF,
StpInF4 prjPrvCDGH,
// Prior projection.
StpInF4 prjPriABEF,
StpInF4 prjPriCDGH,
// Current projection (the difference enables changing zoom).
StpInF4 prjCurABEF,
StpInF4 prjCurCDGH,
// Forward viewspace transform.
// Transform prior 3D view position into current 3D view position.
// This is used to do static geometry forward projection.
//  X := x*i + y*j +z*k +l
//  Y := x*m + y*n +z*o +p
//  Z := x*q + y*r +z*s +t
//  W := 1
//  i j k l
//  m n o p
//  q r s t
//  0 0 0 1
StpInF4 forIJKL,
StpInF4 forMNOP,
StpInF4 forQRST,
// Prior frame backward viewspace transform.
// Transform prior 3D view position into previous-prior 3D view position.
// This is used to 'fix' static geometry forward projection for dynamic motion.
//  X := x*i + y*j +z*k +l
//  Y := x*m + y*n +z*o +p
//  Z := x*q + y*r +z*s +t
//  W := 1
//  i j k l
//  m n o p
//  q r s t
//  0 0 0 1
StpInF4 bckIJKL,
StpInF4 bckMNOP,
StpInF4 bckQRST) {
//------------------------------------------------------------------------------------------------------------------------------
    // StpF2 kRcpC := 1.0 / size of current input image in pixels.
    con0[0] = StpU1_F1(StpF1_(1.0) / imgC[0]);
    con0[1] = StpU1_F1(StpF1_(1.0) / imgC[1]);
    // StpF2 kHalfRcpC := 0.5 / size of current input image in pixels.
    con0[2] = StpU1_F1(StpF1_(0.5) / imgC[0]);
    con0[3] = StpU1_F1(StpF1_(0.5) / imgC[1]);
//------------------------------------------------------------------------------------------------------------------------------
    // Grab jitter for current and prior frames.
    // 'frame - 1' is unsigned, so frame 0 wraps to the largest StpU1 value before being handed to StpJit().
    StpVarF2 jitP;
    StpVarF2 jitC;
    StpJit(jitP, frame - StpU1_(1));
    StpJit(jitC, frame);
    // StpF2 kJitCRcpCUnjitPRcpP := Map current into prior frame.
    con1[0] = StpU1_F1(jitC[0] / imgC[0] - jitP[0] / imgP[0]);
    con1[1] = StpU1_F1(jitC[1] / imgC[1] - jitP[1] / imgP[1]);
    // StpF2 kJitCRcpC := Take {0 to 1} position in current image, and map back to {0 to 1} position in feedback (removes jitter).
    con1[2] = StpU1_F1(jitC[0] / imgC[0]);
    con1[3] = StpU1_F1(jitC[1] / imgC[1]);
//------------------------------------------------------------------------------------------------------------------------------
    // StpF2 kF := size of feedback (aka output) in pixels.
    con2[0] = StpU1_F1(imgF[0]);
    con2[1] = StpU1_F1(imgF[1]);
    // StpF2 kDepth := Copied logic from StpZCon().
    // {1/near, 1/log2(far/near)}.
    StpF1 k0 = StpRcpF1(near);
    StpF1 k1 = StpRcpF1(StpLog2F1(k0 * far));
    con2[2] = StpU1_F1(k0);
    con2[3] = StpU1_F1(k1);
//------------------------------------------------------------------------------------------------------------------------------
    // StpF4 kOS := Scale and bias to check for out of bounds (and kill feedback).
    // Scaled and biased output needs to be {-1 out of bounds, >-1 in bounds, <1 in bounds, 1 out of bounds}.
    StpVarF2 s;
    // Undo 'pM' scaling, and multiply by 2 (as this needs to be -1 to 1 at edge of acceptable reprojection).
    s[0] = StpF1_(2.0);
    s[1] = StpF1_(2.0);
    // Scaling to push outside safe reprojection over 1.
    s[0] *= imgP[0] / (imgP[0] + StpF1_(4.0));
    s[1] *= imgP[1] / (imgP[1] + StpF1_(4.0));
    con3[0] = StpU1_F1(s[0]);
    con3[1] = StpU1_F1(s[1]);
    // Factor out subtracting off the mid point scaled by the multiply term.
    con3[2] = StpU1_F1(StpF1_(-0.5) * s[0]);
    con3[3] = StpU1_F1(StpF1_(-0.5) * s[1]);
//------------------------------------------------------------------------------------------------------------------------------
    // StpF2 kUnDepth := Copied logic from StpZUnCon().
    // {log2(far/near), near}.
    con4[0] = StpU1_F1(StpLog2F1(far * StpRcpF1(near)));
    con4[1] = StpU1_F1(near);
    // kMotionMatch
    con4[2] = StpU1_F1(motionMatch);
    // Unused for now.
    con4[3] = StpU1_(0);
//------------------------------------------------------------------------------------------------------------------------------
    // StpF2 kC := Size of current input image in pixels.
    con5[0] = StpU1_F1(imgC[0]);
    con5[1] = StpU1_F1(imgC[1]);
    // kST
    con5[2] = StpU1_F1(bckQRST.z * prjPrvCDGH.z);
    con5[3] = StpU1_F1(bckQRST.w * prjPrvCDGH.z + prjPrvCDGH.w);
//------------------------------------------------------------------------------------------------------------------------------
    // See header docs in "STATIC GEOMETRY MOTION FORWARD PROJECTION".
    // k0123
    con6[0] = StpU1_F1(prjPriCDGH.z / prjPriABEF.x);
    con6[1] = StpU1_F1(prjPriCDGH.w / prjPriABEF.x);
    con6[2] = StpU1_F1(prjPriABEF.z / prjPriABEF.x);
    con6[3] = StpU1_F1(prjPriCDGH.z / prjPriABEF.y);
    // k4567
    con7[0] = StpU1_F1(prjPriCDGH.w / prjPriABEF.y);
    con7[1] = StpU1_F1(prjPriABEF.w / prjPriABEF.y);
    con7[2] = StpU1_F1(forIJKL.x * prjCurABEF.x + forQRST.x * prjCurABEF.z);
    con7[3] = StpU1_F1(forIJKL.y * prjCurABEF.x + forQRST.y * prjCurABEF.z);
    // k89AB
    con8[0] = StpU1_F1(forIJKL.z * prjCurABEF.x + forQRST.z * prjCurABEF.z);
    con8[1] = StpU1_F1(forIJKL.w * prjCurABEF.x + forQRST.w * prjCurABEF.z);
    con8[2] = StpU1_F1(forMNOP.x * prjCurABEF.y + forQRST.x * prjCurABEF.w);
    con8[3] = StpU1_F1(forMNOP.y * prjCurABEF.y + forQRST.y * prjCurABEF.w);
    // kCDEF
    con9[0] = StpU1_F1(forMNOP.z * prjCurABEF.y + forQRST.z * prjCurABEF.w);
    con9[1] = StpU1_F1(forMNOP.w * prjCurABEF.y + forQRST.w * prjCurABEF.w);
    con9[2] = StpU1_F1(forQRST.x * prjCurCDGH.z);
    con9[3] = StpU1_F1(forQRST.y * prjCurCDGH.z);
    // kGHIJ
    conA[0] = StpU1_F1(forQRST.z * prjCurCDGH.z);
    conA[1] = StpU1_F1(forQRST.w * prjCurCDGH.z + prjCurCDGH.w);
    conA[2] = StpU1_F1(bckIJKL.x * prjPrvABEF.x + bckQRST.x * prjPrvABEF.z);
    conA[3] = StpU1_F1(bckIJKL.y * prjPrvABEF.x + bckQRST.y * prjPrvABEF.z);
    // kKLMN
    conB[0] = StpU1_F1(bckIJKL.z * prjPrvABEF.x + bckQRST.z * prjPrvABEF.z);
    conB[1] = StpU1_F1(bckIJKL.w * prjPrvABEF.x + bckQRST.w * prjPrvABEF.z);
    conB[2] = StpU1_F1(bckMNOP.x * prjPrvABEF.y + bckQRST.x * prjPrvABEF.w);
    conB[3] = StpU1_F1(bckMNOP.y * prjPrvABEF.y + bckQRST.y * prjPrvABEF.w);
    // kOPQR
    conC[0] = StpU1_F1(bckMNOP.z * prjPrvABEF.y + bckQRST.z * prjPrvABEF.w);
    conC[1] = StpU1_F1(bckMNOP.w * prjPrvABEF.y + bckQRST.w * prjPrvABEF.w);
    conC[2] = StpU1_F1(bckQRST.x * prjPrvCDGH.z);
    conC[3] = StpU1_F1(bckQRST.y * prjPrvCDGH.z);}
//==============================================================================================================================
// Fills the constant vectors con0..con3 for the TAA pass. All values are stored as float bit patterns (StpU1_F1).
STP_STATIC void StpTaaCon(
// Generated constants.
StpInOutU4 con0,
StpInOutU4 con1,
StpInOutU4 con2,
StpInOutU4 con3,
// Amount of grain {0 = maximum, >0 is amount of stops less of grain}.
// NOTE: this parameter is not read anywhere in this function.
StpF1 grain,
// Frame count for current frame (sets jitter).
StpU1 frame,
// Current image resolution in pixels.
StpInF2 imgC,
// Feedback (aka output) resolution in pixels.
StpInF2 imgF) {
//------------------------------------------------------------------------------------------------------------------------------
    // Grab jitter for current frame.
    StpVarF2 jitC;
    StpJit(jitC, frame);
//------------------------------------------------------------------------------------------------------------------------------
    // Conversion from integer pix position to center pix float pixel position in image for current input.
    //  xy := multiply term (M) --- imgC/imgF, scales a feedback pixel position into current input pixels.
    //  zw := addition term (A) --- Add 0.5*M to get to center of pixel, then subtract jitC to undo jitter.
    // StpF2 kCRcpF.
    con0[0] = StpU1_F1(imgC[0] / imgF[0]);
    con0[1] = StpU1_F1(imgC[1] / imgF[1]);
    // StpF2 kHalfCRcpFUnjitC.
    con0[2] = StpU1_F1(StpF1_(0.5) * imgC[0] / imgF[0] - jitC[0]);
    con0[3] = StpU1_F1(StpF1_(0.5) * imgC[1] / imgF[1] - jitC[1]);
//------------------------------------------------------------------------------------------------------------------------------
    // StpF2 kRcpC := 1/size of current input image in pixels.
    con1[0] = StpU1_F1(StpF1_(1.0) / imgC[0]);
    con1[1] = StpU1_F1(StpF1_(1.0) / imgC[1]);
//------------------------------------------------------------------------------------------------------------------------------
    // StpF2 kRcpF := 1/size of feedback image (aka output) in pixels.
    con1[2] = StpU1_F1(StpF1_(1.0) / imgF[0]);
    con1[3] = StpU1_F1(StpF1_(1.0) / imgF[1]);
//------------------------------------------------------------------------------------------------------------------------------
    // StpF2 kHalfRcpF := 0.5/size of feedback image (aka output) in pixels.
    con2[0] = StpU1_F1(StpF1_(0.5) / imgF[0]);
    con2[1] = StpU1_F1(StpF1_(0.5) / imgF[1]);
//------------------------------------------------------------------------------------------------------------------------------
    // Conversion from a {0 to 1} position in current input to feedback.
    // kJitCRcpC0 := jitC / image size in pixels + {-0.5/size, +0.5/size} of current input image in pixels.
    // Note the x term subtracts the half pixel while the y term adds it.
    con2[2] = StpU1_F1(jitC[0] / imgC[0] - StpF1_(0.5) / imgC[0]);
    con2[3] = StpU1_F1(jitC[1] / imgC[1] + StpF1_(0.5) / imgC[1]);
//------------------------------------------------------------------------------------------------------------------------------
    // StpF2 kHalfRcpC := 0.5/size of current input image in pixels.
    con3[0] = StpU1_F1(StpF1_(0.5) / imgC[0]);
    con3[1] = StpU1_F1(StpF1_(0.5) / imgC[1]);
//------------------------------------------------------------------------------------------------------------------------------
    // StpF2 kF := size of feedback image in pixels.
    con3[2] = StpU1_F1(imgF[0]);
    con3[3] = StpU1_F1(imgF[1]); }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//
//                                                    PATTERN ENTRY POINT
//
//==============================================================================================================================
// See the packed 16-bit version for comments.
#if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_PAT)
    // Callbacks used by StpPatF(). They are only declared in this section.
    // 4x4 reductions, driven by the 'lane' argument of StpPatF().
    void StpPat4x4MaxF8(StpMU1 i, inout StpF4 a, inout StpF4 b);
    void StpPat4x4SumF4(StpMU1 i, inout StpF4 a);
//------------------------------------------------------------------------------------------------------------------------------
    // Prior convergence, sampled at a {0 to 1} position.
    StpMF1 StpPatPriConF(StpF2 p);
//------------------------------------------------------------------------------------------------------------------------------
    // Current frame inputs, loaded at an integer pixel position.
    StpF2 StpPatDatMotF(StpMU2 o);
    StpMF3 StpPatDatColF(StpMU2 o);
    StpF1 StpPatDatZF(StpMU2 o);
    StpF1 StpPatFixZF(StpF1 z);
    StpU1 StpPatDatRF(StpMU2 o);
    StpMF1 StpPatFixRF(StpU1 v);
//------------------------------------------------------------------------------------------------------------------------------
    // Dither value, loaded at an integer pixel position.
    StpMF1 StpPatDitF(StpMU2 o);
//------------------------------------------------------------------------------------------------------------------------------
    // Prior feedback, sampled at a {0 to 1} position (single value and per-channel 4-texel variants).
    StpMF4 StpPatPriFedF(StpF2 p);
    StpMF4 StpPatPriFedR4F(StpF2 p);
    StpMF4 StpPatPriFedG4F(StpF2 p);
    StpMF4 StpPatPriFedB4F(StpF2 p);
//------------------------------------------------------------------------------------------------------------------------------
    // Prior luma pair, sampled at a {0 to 1} position.
    StpMF2 StpPatPriLumF(StpF2 p);
//------------------------------------------------------------------------------------------------------------------------------
    // Prior packed {z, motion}, sampled at a {0 to 1} position (optionally with a texel offset and/or min reduction).
    StpU4 StpPatPriMot4F(StpF2 p);
    #if STP_MAX_MIN_UINT
        StpU1 StpPatPriMotMinF(StpF2 p);
    #endif // STP_MAX_MIN_UINT
    #if STP_OFFSETS
        StpU4 StpPatPriMot4OF(StpF2 p, StpI2 o);
        #if STP_MAX_MIN_UINT
            StpU1 StpPatPriMotMinOF(StpF2 p, StpI2 o);
        #endif // STP_MAX_MIN_UINT
    #endif // STP_OFFSETS
//------------------------------------------------------------------------------------------------------------------------------
    // Stores for the four outputs of the pass, at an integer pixel position.
    void StpPatStMotF(StpMU2 p, StpU1 v);
    void StpPatStColF(StpMU2 p, StpMF4 v);
    void StpPatStLumF(StpMU2 p, StpMF2 v);
    void StpPatStCnvF(StpMU2 p, StpMF1 v);
//==============================================================================================================================
    // Pattern pass for one pixel (32-bit path).
    //  lane ........ Index handed to the 4x4 reduction callbacks.
    //  pp .......... Integer pixel position used for all StpPatDat*() loads and all StpPatSt*() stores.
    //  con0..conD .. Packed constants, unpacked below (float values are stored as bit patterns).
    // Outputs (via callbacks): packed {z, motion}, dithered sqrt-space color with motion match in alpha,
    // {this luma, prior luma}, and convergence.
    void StpPatF(
        StpMU1 lane,
        StpMU2 pp,
        StpU4 con0,
        StpU4 con1,
        StpU4 con2,
        StpU4 con3,
        StpU4 con4,
        StpU4 con5,
        StpU4 con6,
        StpU4 con7,
        StpU4 con8,
        StpU4 con9,
        StpU4 conA,
        StpU4 conB,
        StpU4 conC,
        StpU4 conD) {
//------------------------------------------------------------------------------------------------------------------------------
        StpMF4 rC;
        StpU1 rM;
        StpMF2 rL;
        StpMF1 rCnv;
//------------------------------------------------------------------------------------------------------------------------------
        StpF2 kRcpC = StpF2_U2(con0.xy);
        StpF2 kHalfRcpC = StpF2_U2(con0.zw);
        StpF2 kJitCRcpCUnjitPRcpP = StpF2_U2(con1.xy);
        StpF2 kJitCRcpC = StpF2_U2(con1.zw);
        StpF2 kF = StpF2_U2(con2.xy);
        StpF4 kOS = StpF4_U4(con3);
        StpF2 kDepth = StpF2_U2(con2.zw);
        StpF2 kUnDepth = StpF2_U2(con4.xy);
        StpF1 kMotionMatch = StpF1_U1(con4.z);
        StpF2 kC = StpF2_U2(con5.xy);
        StpF4 k0123 = StpF4_U4(con6);
        StpF4 k4567 = StpF4_U4(con7);
        StpF4 k89AB = StpF4_U4(con8);
        StpF4 kCDEF = StpF4_U4(con9);
        StpF4 kGHIJ = StpF4_U4(conA);
        StpF4 kKLMN = StpF4_U4(conB);
        StpF4 kOPQR = StpF4_U4(conC);
        // NOTE(review): kST is read from conD.xy here, but StpPatCon() in this file writes kST to con5[2..3] -- confirm layout.
        StpF2 kST = StpF2_U2(conD.xy);
//------------------------------------------------------------------------------------------------------------------------------
        StpF2 m = StpPatDatMotF(pp);
        StpMF1 d = StpPatDitF(pp);
        StpF1 zPre = StpPatDatZF(pp);
        StpMF3 c = StpPatDatColF(pp);
//==============================================================================================================================
//                                              DEPENDENT INLINE INPUT MOTION
//==============================================================================================================================
        // Pixel center as a {0 to 1} position in the current image.
        StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
//------------------------------------------------------------------------------------------------------------------------------
        // Check the streaming bandwidth limit.
        // Issues every load once, folds the results into the outputs so nothing is optimized away, stores, and returns.
        #if STP_BUG_BW_SOL
        {   StpMF2 lum2 = StpPatPriLumF(p);
            StpMF1 cnvPrev = StpPatPriConF(p);
            StpU4 mZVP4 = StpPatPriMot4F(p);
            // StpPatDatRF() takes an integer pixel position like the other StpPatDat*() loads, so it must be given 'pp'
            // and not the {0 to 1} float position 'p'.
            StpU1 rPre = StpPatDatRF(pp);
            StpMF3 f = StpPatPriFedF(p).rgb;
            StpF1 z = StpPatFixZF(zPre);
            StpMF1 r = StpPatFixRF(rPre);
            rC.rgb = StpMF3_(m.x) + StpMF3_(d.x) + c + StpMF3_(lum2.x) + StpMF3_(cnvPrev) + StpMF3(mZVP4.xyz) + f + StpMF3_(z+r);
            rC.a = StpMF1_(0.0);
            rL = rC.rg;
            rM = StpU1_(rC.r);
            rCnv = rC.r;
            StpPatStMotF(pp, rM);
            StpPatStLumF(pp, rL);
            StpPatStColF(pp, rC);
            StpPatStCnvF(pp, rCnv);
            return; }
        #endif // STP_BUG_BW_SOL
//------------------------------------------------------------------------------------------------------------------------------
        // Reprojected position: 'pF' is used for feedback fetches, 'pM' for prior-frame {luma, convergence, motion} fetches.
        StpF2 pM = (p - m);
        StpF2 pF = pM + kJitCRcpC;
        pM = pM + kJitCRcpCUnjitPRcpP;
//------------------------------------------------------------------------------------------------------------------------------
        StpMF2 lum2 = StpPatPriLumF(pM);
//------------------------------------------------------------------------------------------------------------------------------
        StpMF1 cnvPrev = StpPatPriConF(pM);
//------------------------------------------------------------------------------------------------------------------------------
        #if (STP_SAFE_DILATE == 2)
            #if STP_MAX_MIN_UINT
                StpU4 mZVP4;
                #if STP_OFFSETS
                    mZVP4.x = StpPatPriMotMinOF(pM, StpI2(-1, -1));
                    mZVP4.y = StpPatPriMotMinOF(pM, StpI2( 1, -1));
                    mZVP4.z = StpPatPriMotMinOF(pM, StpI2(-1,  1));
                    mZVP4.w = StpPatPriMotMinOF(pM, StpI2( 1,  1));
                #else // STP_OFFSETS
                    mZVP4.x = StpPatPriMotMinF(pM + StpF2(-kRcpC.x, -kRcpC.y));
                    mZVP4.y = StpPatPriMotMinF(pM + StpF2( kRcpC.x, -kRcpC.y));
                    mZVP4.z = StpPatPriMotMinF(pM + StpF2(-kRcpC.x,  kRcpC.y));
                    mZVP4.w = StpPatPriMotMinF(pM + StpF2( kRcpC.x,  kRcpC.y));
                #endif // STP_OFFSETS
            #else // STP_MAX_MIN_UINT
                #if STP_OFFSETS
                    StpU4 mZVP4_0 = StpPatPriMot4OF(pM, StpI2(-1, -1));
                    StpU4 mZVP4_1 = StpPatPriMot4OF(pM, StpI2( 1, -1));
                    StpU4 mZVP4_2 = StpPatPriMot4OF(pM, StpI2(-1,  1));
                    StpU4 mZVP4_3 = StpPatPriMot4OF(pM, StpI2( 1,  1));
                #else // STP_OFFSETS
                    StpU4 mZVP4_0 = StpPatPriMot4F(pM + StpF2(-kRcpC.x, -kRcpC.y));
                    StpU4 mZVP4_1 = StpPatPriMot4F(pM + StpF2( kRcpC.x, -kRcpC.y));
                    StpU4 mZVP4_2 = StpPatPriMot4F(pM + StpF2(-kRcpC.x,  kRcpC.y));
                    StpU4 mZVP4_3 = StpPatPriMot4F(pM + StpF2( kRcpC.x,  kRcpC.y));
                #endif // STP_OFFSETS
            #endif // STP_MAX_MIN_UINT
        #else // (STP_SAFE_DILATE == 2)
            StpU1 mZVPN;
            StpU4 mZVP2a = StpPatPriMot4F(pM - kHalfRcpC);
            StpU4 mZVP2b = StpPatPriMot4F(pM + kHalfRcpC);
            #if STP_MAX_MIN_UINT
                mZVPN = StpPatPriMotMinF(pM);
            #else // STP_MAX_MIN_UINT
                StpU4 mZVP4 = StpPatPriMot4F(pM);
            #endif // STP_MAX_MIN_UINT
        #endif // (STP_SAFE_DILATE == 2)
//------------------------------------------------------------------------------------------------------------------------------
        StpU1 rPre = StpPatDatRF(pp);
//------------------------------------------------------------------------------------------------------------------------------
        StpMF4 f4R = StpPatPriFedR4F(pF);
        StpMF4 f4G = StpPatPriFedG4F(pF);
        StpMF4 f4B = StpPatPriFedB4F(pF);
        StpMF3 f = StpPatPriFedF(pF).rgb;
//==============================================================================================================================
//                                      DEPENDENT ON DITHER AND INLINE INPUT PARAMETERS
//==============================================================================================================================
        // Pack this pixel's depth and motion (with dither) and store it.
        StpF1 dd = StpF1_(d);
        StpF1 z = StpPatFixZF(zPre);
        z = StpZPack(z, kDepth, dd);
        rM = StpMvPack(z, m, dd);
        StpPatStMotF(pp, rM);
//------------------------------------------------------------------------------------------------------------------------------
        #if STP_BUG
            // Pattern/Clipped Input Color
            {   StpF4 bug = StpF4_(0.0);
                bug.rgb = sqrt(StpF3(c.rgb));
                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
                StpBugF(StpU3(pp, 0), bug); }
//------------------------------------------------------------------------------------------------------------------------------
            // Pattern/Log Input Depth
            {   StpF4 bug = StpF4_(0.0);
                bug.rgb = StpF3_(StpSatF1(z + StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
                StpBugF(StpU3(pp, 1), bug); }
        #endif // STP_BUG
//------------------------------------------------------------------------------------------------------------------------------
        #if (STP_POSTMAP == 0)
            StpToneMF3(c);
        #endif // (STP_POSTMAP == 0)
//------------------------------------------------------------------------------------------------------------------------------
        #if STP_BUG
            // Pattern/Reversible Tonemapped Input Color
            {   StpF4 bug = StpF4_(0.0);
                bug.rgb = sqrt(StpF3(c.rgb));
                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
                StpBugF(StpU3(pp, 2), bug); }
        #endif // STP_BUG
//------------------------------------------------------------------------------------------------------------------------------
        // From here on 'c' is in square-root space. The stored color gets a +/- half step (1/1023) dither.
        c = sqrt(c);
        rC.rgb = StpSatMF3(c + StpMF3_(d * StpMF1(1.0 / 1023.0) + StpMF1(-0.5 / 1023.0)));
//------------------------------------------------------------------------------------------------------------------------------
        // Store {this frame luma, reprojected prior luma}.
        rL.x = dot(c, StpMF3(STP_LUMA));
        rL.y = lum2.x;
        StpPatStLumF(pp, rL);
//------------------------------------------------------------------------------------------------------------------------------
        #if STP_BUG
            // Pattern/Shaped Absolute Input Motion
            {   StpF4 bug = StpF4_(0.0);
                bug.b = sqrt(StpF1_(rL.x) * StpF1_(0.25));
                bug.rg = StpF2_(1.0) - exp2(abs(StpF2(m)) * StpF2_(-32.0));
                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
                StpBugF(StpU3(pp, 3), bug); }
        #endif // STP_BUG
//------------------------------------------------------------------------------------------------------------------------------
        // Smaller of the last two frame-to-frame luma differences, scaled by STP_PAT_DEMOIRE.
        StpMF1 moire = min(abs(rL.x - lum2.x), abs(lum2.x - lum2.y));
        moire *= StpMF1_(STP_PAT_DEMOIRE);
//------------------------------------------------------------------------------------------------------------------------------
        // Pack {v, -v} pairs so a single max reduction yields both the max and the (negated) min over the 4x4 group.
        StpMF4 xnyRG = StpMF4(c.r, -c.r, c.g, -c.g);
        StpMF4 xnyBC = StpMF4(c.b, -c.b, -cnvPrev, -cnvPrev);
        // NOTE(review): the STP_16BIT branch is empty, so with STP_16BIT defined no 4x4 max reduction happens here,
        // while the sum further down does call StpPat4x4SumH4() -- confirm this asymmetry is intended.
        #if defined(STP_16BIT)
        #else // defined(STP_16BIT)
            // We convert to full precision floats here since the reductions work on 32-bit values.
            StpF4 xnyRGF = StpF4(xnyRG);
            StpF4 xnyBCF = StpF4(xnyBC);
            StpPat4x4MaxF8(lane, xnyRGF, xnyBCF);
            xnyRG = StpMF4(xnyRGF);
            xnyBC = StpMF4(xnyBCF);
        #endif // defined(STP_16BIT)
        cnvPrev = -xnyBC.z;
        // Per-channel neighborhood extent (max - min), floored at STP_PAT_NE_MIN * max.
        StpMF3 ne = max(StpMF3_(STP_PAT_NE_MIN) * StpMF3(xnyRG.x, xnyRG.z, xnyBC.x),
            StpMF3(xnyRG.x + xnyRG.y, xnyRG.z + xnyRG.w, xnyBC.x + xnyBC.y));
        StpMF1 ne1 = dot(ne, StpMF3(STP_LUMA));
//------------------------------------------------------------------------------------------------------------------------------
        // Advance prior convergence by one frame.
        cnvPrev = StpSatMF1(cnvPrev + StpMF1_(1.0 / STP_FRAME_MAX));
//------------------------------------------------------------------------------------------------------------------------------
        // Off-screen test for the reprojected position using the kOS scale and bias.
        StpF2 onXY = StpF2(pM.xy);
        onXY = onXY * kOS.xy + kOS.zw;
        StpF1 onS = StpSignedF1(max(abs(onXY.x), abs(onXY.y)) - StpF1_(1.0));
//------------------------------------------------------------------------------------------------------------------------------
        #if STP_BUG
            // Pattern/Motion Reprojection {R=Prior G=This Sqrt Luma Feedback Diff, B=Offscreen}
            {   StpF4 bug = StpF4_(0.0);
                bug.g = StpF1_(abs(rL.x - lum2.x));
                bug.r = StpF1_(abs(lum2.x - lum2.y));
                bug.b = StpF1_(1.0) - StpF1_(onS);
                bug.rg = sqrt(bug.rg);
                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
                StpBugF(StpU3(pp, 4), bug); }
        #endif // STP_BUG
//==============================================================================================================================
//                                                 DEPENDENT ON PRIOR {Z, MOTION}
//==============================================================================================================================
        // Reduce the fetched prior packed {z, motion} values to a single minimum, 'mZVPN'.
        #if (STP_SAFE_DILATE == 2)
            #if (STP_MAX_MIN_UINT == 0)
                StpU4 mZVP4 = min(StpMin3U4(mZVP4_0, mZVP4_1, mZVP4_2), mZVP4_3);
            #endif // (STP_MAX_MIN_UINT == 0)
            StpU1 mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
        #else // (STP_SAFE_DILATE == 2)
            #if (STP_MAX_MIN_UINT == 0)
                mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
            #endif // (STP_MAX_MIN_UINT == 0)
            #if STP_SAFE_DILATE
                mZVPN = StpMin3U1(StpMin3U1(mZVPN, mZVP2a.x, mZVP2a.z), mZVP2b.x, mZVP2b.z);
            #endif // STP_SAFE_DILATE
        #endif // (STP_SAFE_DILATE == 2)
//------------------------------------------------------------------------------------------------------------------------------
        StpF2 mPN;
        StpF1 mZPN;
        StpMvUnpack(mZPN, mPN, mZVPN);
//------------------------------------------------------------------------------------------------------------------------------
        // Motion tolerance: (sqrt(|m|) + 1/256)^2 - |m|.
        StpF2 mE;
        mE = sqrt(abs(m)) + StpF2_(1.0 / 256.0);
        mE = mE * mE - abs(m);
//------------------------------------------------------------------------------------------------------------------------------
        // Compare StpFor()'s motion against the input motion; the excess over the tolerance, in pixels, squared, is 'sgD'.
        StpF1 sgZ = StpZUnpack(mZPN, kUnDepth);
        StpF2 bugF; StpF2 bugD;
        StpF2 sgM = StpFor(pM, sgZ, mPN, kMotionMatch, k0123, k4567, k89AB, kCDEF, kGHIJ, kKLMN, kOPQR, kST, bugF, bugD);
        sgM = StpSatF2(abs(sgM * StpF2_(0.5) - m) - mE) * kC;
        StpMF1 sgD = StpMF1(dot(sgM, sgM));
//------------------------------------------------------------------------------------------------------------------------------
        // Motion match goes into the stored color's alpha, and is scaled by 'onS'.
        StpMF1 match = StpMF1_(1.0) - StpSatMF1(sgD * StpMF1_(STP_PAT_MOT_AMP) - StpMF1_(STP_PAT_MOT_ADD * STP_PAT_MOT_AMP));
        match *= StpMF1_(onS);
        rC.a = match;
        StpPatStColF(pp, rC);
//------------------------------------------------------------------------------------------------------------------------------
        // The 1/8192 term keeps the reciprocal's argument non-zero.
        moire = moire * match + StpMF1_(1.0 / 8192.0);
        moire = min(StpMF1_(1.0), ne1 * StpRcpMF1(moire));
//------------------------------------------------------------------------------------------------------------------------------
        // Temporal sensitivity, boosted by STP_PAT_RESPONSIVE * (1 - r).
        StpMF1 tS = moire;
        StpMF1 r = StpPatFixRF(rPre);
        tS = tS * (StpMF1_(STP_PAT_RESPONSIVE) - r * StpMF1_(STP_PAT_RESPONSIVE)) + tS;
//------------------------------------------------------------------------------------------------------------------------------
        #if STP_BUG
            // Pattern/Sensitivity {G=No motion match, R=Responsive, B=Luma}
            {   StpF4 bug = StpF4_(0.0);
                bug.g = StpF1_(1.0) - StpF1(match);
                bug.r = StpF1_(1.0) - StpF1(r);
                bug.b = StpF1_(rL.x);
                bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
                StpBugF(StpU3(pp, 5), bug); }
        #endif // STP_BUG
//==============================================================================================================================
//                                                    DEPENDENT ON FEEDBACK
//==============================================================================================================================
        // Of the five feedback candidates (one StpPatPriFedF value plus four per-channel texels), keep the color
        // difference with the smallest luma-weighted absolute difference to 'c'. On ties the later test below wins.
        StpMF4 t;
        t.rgb = c - f;
        t.a = dot(abs(t.rgb), StpMF3(STP_LUMA));
        StpMF4 t4R = f4R - StpMF4_(c.r);
        StpMF4 t4G = f4G - StpMF4_(c.g);
        StpMF4 t4B = f4B - StpMF4_(c.b);
        StpMF4 t4A = abs(t4R) * StpMF4_(STP_LUMA_R) + abs(t4G) * StpMF4_(STP_LUMA_G) + abs(t4B) * StpMF4_(STP_LUMA_B);
        t.a = StpMin3MF1(t.a, t4A.x, StpMin3MF1(t4A.y, t4A.z, t4A.w));
        if(t.a == t4A.x) t.rgb = StpMF3(t4R.x, t4G.x, t4B.x);
        if(t.a == t4A.y) t.rgb = StpMF3(t4R.y, t4G.y, t4B.y);
        if(t.a == t4A.z) t.rgb = StpMF3(t4R.z, t4G.z, t4B.z);
        if(t.a == t4A.w) t.rgb = StpMF3(t4R.w, t4G.w, t4B.w);
//------------------------------------------------------------------------------------------------------------------------------
        t.rgb *= StpMF3_(tS);
//------------------------------------------------------------------------------------------------------------------------------
        #if defined(STP_16BIT)
            StpPat4x4SumH4(lane, t);
        #else // defined(STP_16BIT)
            // We convert to full precision floats here since the reductions work on 32-bit values, and MF might be 16-bit.
            StpF4 tF = StpF4(t);
            StpPat4x4SumF4(lane, tF);
            t = StpMF4(tF);
        #endif // defined(STP_16BIT)
        t.rgb *= StpMF3_(STP_PAT_SENSITIVITY);
//------------------------------------------------------------------------------------------------------------------------------
        // Blend ratio: neighborhood extent over summed temporal difference, minimum across channels.
        StpMF3 bln3 = StpSatMF3(ne * StpRcpMF3(abs(t.rgb)));
        StpMF1 bln = StpMin3MF1(bln3.r, bln3.g, bln3.b);
//------------------------------------------------------------------------------------------------------------------------------
        StpMF1 cnv = StpSatMF1(bln * StpRcpMF1(StpMF1_(STP_FRAME_MAX) - StpMF1_(STP_FRAME_MAX) * bln));
//------------------------------------------------------------------------------------------------------------------------------
        // Stored convergence is the smaller of this frame's estimate (less one frame) and the advanced prior value.
        cnv = StpSatMF1(cnv - StpMF1_(1.0 / STP_FRAME_MAX));
        rCnv = min(cnv, cnvPrev);
        StpPatStCnvF(pp, rCnv); }
#endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_PAT)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                        16-BIT PATH
//==============================================================================================================================
// See the packed 16-bit version for comments.
#if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_PAT)
    // 4x4 wave op: 8 component maximum.
    void StpPat4x4MaxH8(StpW1 i, inout StpH4 a, inout StpH4 b);
    // 4x4 wave op: 4 component sum.
void StpPat4x4SumH4(StpW1 i, inout StpH4 a);
//------------------------------------------------------------------------------------------------------------------------------
// Sample bilinear interpolated clamp to edge prior convergence.
StpH1 StpPatPriConH(StpF2 p);
//------------------------------------------------------------------------------------------------------------------------------
// Note this is still designed to be an inline function pass merged to avoid DRAM traffic.
// So in an ideal world (with better merging with pre-scale post) these would be already in registers.
// But when PAT pass is non-inline, these callbacks are placed in the right order for loads.
// Input motion, 'position - motion' is the reprojected position, where {0 to 1} is range of the screen.
StpF2 StpPatDatMotH(StpW2 o);
// Input color, this is linear HDR or post-tonemap-linear depending on STP_POSTMAP.
StpH3 StpPatDatColH(StpW2 o);
// Depth fetch at pixel 'o'. StpPatH() passes the fetched value through StpPatFixZH() before using it.
StpF1 StpPatDatZH(StpW2 o);
// Input depth, this is linear {0:near to INF:far} ranged.
StpF1 StpPatFixZH(StpF1 z);
// Responsive-mask fetch at pixel 'o'. StpPatH() passes the fetched value through StpPatFixRH() before using it.
StpU1 StpPatDatRH(StpW2 o);
// Responsive input pixel {0.0 := responsive, 1.0 := normal}.
StpH1 StpPatFixRH(StpU1 v);
//------------------------------------------------------------------------------------------------------------------------------
// Dither value {0 to 1} this should be input pixel frequency spatial temporal blue noise.
StpH1 StpPatDitH(StpW2 o);
//------------------------------------------------------------------------------------------------------------------------------
// Sample bilinear interpolated clamp to edge prior feedback.
StpH4 StpPatPriFedH(StpF2 p);
// Gather4 versions.
StpH4 StpPatPriFedR4H(StpF2 p);
StpH4 StpPatPriFedG4H(StpF2 p);
StpH4 StpPatPriFedB4H(StpF2 p);
//------------------------------------------------------------------------------------------------------------------------------
// Sample bilinear interpolated clamp to edge 2-frame luma ring.
StpH2 StpPatPriLumH(StpF2 p);
//------------------------------------------------------------------------------------------------------------------------------
// Gather4 on prior {z,motion}.
StpU4 StpPatPriMot4H(StpF2 p);
#if STP_MAX_MIN_UINT
    StpU1 StpPatPriMotMinH(StpF2 p);
#endif // STP_MAX_MIN_UINT
#if STP_OFFSETS
    StpU4 StpPatPriMot4OH(StpF2 p, StpI2 o);
    #if STP_MAX_MIN_UINT
        StpU1 StpPatPriMotMinOH(StpF2 p, StpI2 o);
    #endif // STP_MAX_MIN_UINT
#endif // STP_OFFSETS
//------------------------------------------------------------------------------------------------------------------------------
// Store callbacks for the four outputs of the pass.
void StpPatStMotH(StpW2 p, StpU1 v);
void StpPatStColH(StpW2 p, StpH4 v);
void StpPatStLumH(StpW2 p, StpH2 v);
void StpPatStCnvH(StpW2 p, StpH1 v);
//==============================================================================================================================
// 16-bit pattern pass entry point.
//  lane ........ Index handed to the 4x4 wave reductions (StpPat4x4MaxH8 / StpPat4x4SumH4).
//  pp .......... Pixel coordinate used for the 'Dat'/'Dit' input fetches and for every store callback.
//  con0..conD .. Packed constants, reinterpreted as floats in the "Rename constants" section below.
// Nothing is returned; results leave through the store callbacks:
//  StpPatStMotH .. packed {z, motion}
//  StpPatStLumH .. {this frame luma, prior frame luma}
//  StpPatStColH .. dithered gamma 2.0 color, with the motion match in alpha
//  StpPatStCnvH .. convergence
void StpPatH(
    StpW1 lane,
    StpW2 pp,
    StpU4 con0,
    StpU4 con1,
    StpU4 con2,
    StpU4 con3,
    StpU4 con4,
    StpU4 con5,
    StpU4 con6,
    StpU4 con7,
    StpU4 con8,
    StpU4 con9,
    StpU4 conA,
    StpU4 conB,
    StpU4 conC,
    StpU4 conD) {
//------------------------------------------------------------------------------------------------------------------------------
    // Outputs.
    StpH4 rC;
    StpU1 rM;
    StpH2 rL;
    StpH1 rCnv;
//------------------------------------------------------------------------------------------------------------------------------
    // Rename constants.
    StpF2 kRcpC = StpF2_U2(con0.xy);
    StpF2 kHalfRcpC = StpF2_U2(con0.zw);
    StpF2 kJitCRcpCUnjitPRcpP = StpF2_U2(con1.xy);
    StpF2 kJitCRcpC = StpF2_U2(con1.zw);
    StpF2 kF = StpF2_U2(con2.xy);
    StpF4 kOS = StpF4_U4(con3);
    StpF2 kDepth = StpF2_U2(con2.zw);
    StpF2 kUnDepth = StpF2_U2(con4.xy);
    StpF1 kMotionMatch = StpF1_U1(con4.z);
    StpF2 kC = StpF2_U2(con5.xy);
    StpF4 k0123 = StpF4_U4(con6);
    StpF4 k4567 = StpF4_U4(con7);
    StpF4 k89AB = StpF4_U4(con8);
    StpF4 kCDEF = StpF4_U4(con9);
    StpF4 kGHIJ = StpF4_U4(conA);
    StpF4 kKLMN = StpF4_U4(conB);
    StpF4 kOPQR = StpF4_U4(conC);
    StpF2 kST = StpF2_U2(conD.xy);
//------------------------------------------------------------------------------------------------------------------------------
    StpF2 m = StpPatDatMotH(pp);
    // This dither fetch should likely be shared with pass merged pre-scale post work in the future.
    StpH1 d = StpPatDitH(pp);
    StpF1 zPre = StpPatDatZH(pp);
    StpH3 c = StpPatDatColH(pp);
//==============================================================================================================================
//                                                DEPENDENT INLINE INPUT MOTION
//==============================================================================================================================
    // Work towards getting all dependent fetches out first.
    // Compute float position {0 to 1} across screen.
    StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
//------------------------------------------------------------------------------------------------------------------------------
    #if STP_BUG_BW_SOL
    {
        // Debug path: issue every fetch and store once, folding all inputs into the outputs so none is optimized away.
        StpH2 lum2 = StpPatPriLumH(p);
        StpH1 cnvPrev = StpPatPriConH(p);
        StpU4 mZVP4 = StpPatPriMot4H(p);
        // Fetch by pixel coordinate like the main path: the callback takes StpW2, not the {0 to 1} float position.
        StpU1 rPre = StpPatDatRH(pp);
        StpH3 f = StpPatPriFedH(p).rgb;
        StpF1 z = StpPatFixZH(zPre);
        StpH1 r = StpPatFixRH(rPre);
        rC.rgb = StpH3_(m.x) + StpH3_(d.x) + c + StpH3_(lum2.x) + StpH3_(cnvPrev) + StpH3(mZVP4.xyz) + f + StpH3_(z+r);
        rC.a = StpH1_(0.0);
        rL = rC.rg;
        rM = StpU1_(rC.r);
        rCnv = rC.r;
        StpPatStMotH(pp, rM);
        StpPatStLumH(pp, rL);
        StpPatStColH(pp, rC);
        StpPatStCnvH(pp, rCnv);
        return;
    }
    #endif // STP_BUG_BW_SOL
//------------------------------------------------------------------------------------------------------------------------------
    // Reprojection position in prior input and feedback.
    StpF2 pM = (p - m);
    StpF2 pF = pM + kJitCRcpC;
    pM = pM + kJitCRcpCUnjitPRcpP;
//------------------------------------------------------------------------------------------------------------------------------
    // Fetch 2-frame reprojected history ring of luma.
    StpH2 lum2 = StpPatPriLumH(pM);
//------------------------------------------------------------------------------------------------------------------------------
    // Fetch reprojected low-frequency convergence prior frame.
    StpH1 cnvPrev = StpPatPriConH(pM);
//------------------------------------------------------------------------------------------------------------------------------
    // Grab large enough neighborhood for prior reprojected nearest {z,motion}.
    // This nearest dilates {z, motion} reprojection to avoid pulling in anti-aliased edges and leaving temporal ringing.
    #if (STP_SAFE_DILATE == 2)
        #if STP_MAX_MIN_UINT
            StpU4 mZVP4;
            #if STP_OFFSETS
                mZVP4.x = StpPatPriMotMinOH(pM, StpI2(-1, -1));
                mZVP4.y = StpPatPriMotMinOH(pM, StpI2( 1, -1));
                mZVP4.z = StpPatPriMotMinOH(pM, StpI2(-1,  1));
                mZVP4.w = StpPatPriMotMinOH(pM, StpI2( 1,  1));
            #else // STP_OFFSETS
                mZVP4.x = StpPatPriMotMinH(pM + StpF2(-kRcpC.x, -kRcpC.y));
                mZVP4.y = StpPatPriMotMinH(pM + StpF2( kRcpC.x, -kRcpC.y));
                mZVP4.z = StpPatPriMotMinH(pM + StpF2(-kRcpC.x,  kRcpC.y));
                mZVP4.w = StpPatPriMotMinH(pM + StpF2( kRcpC.x,  kRcpC.y));
            #endif // STP_OFFSETS
        #else // STP_MAX_MIN_UINT
            #if STP_OFFSETS
                StpU4 mZVP4_0 = StpPatPriMot4OH(pM, StpI2(-1, -1));
                StpU4 mZVP4_1 = StpPatPriMot4OH(pM, StpI2( 1, -1));
                StpU4 mZVP4_2 = StpPatPriMot4OH(pM, StpI2(-1,  1));
                StpU4 mZVP4_3 = StpPatPriMot4OH(pM, StpI2( 1,  1));
            #else // STP_OFFSETS
                StpU4 mZVP4_0 = StpPatPriMot4H(pM + StpF2(-kRcpC.x, -kRcpC.y));
                StpU4 mZVP4_1 = StpPatPriMot4H(pM + StpF2( kRcpC.x, -kRcpC.y));
                StpU4 mZVP4_2 = StpPatPriMot4H(pM + StpF2(-kRcpC.x,  kRcpC.y));
                StpU4 mZVP4_3 = StpPatPriMot4H(pM + StpF2( kRcpC.x,  kRcpC.y));
            #endif // STP_OFFSETS
        #endif // STP_MAX_MIN_UINT
    #else // (STP_SAFE_DILATE == 2)
        StpU1 mZVPN;
        // To be correct here this needs 'kHalfRcpP' (prior instead of current).
        // But didn't want to pass yet another pair of constants, so using current instead.
        // TODO: If later moving to 'kHalfRcpP' can use one sample by offset to save some VALU ops.
        // Also this is only used if STP_SAFE_DILATE=1 (else dead code).
        StpU4 mZVP2a = StpPatPriMot4H(pM - kHalfRcpC);
        StpU4 mZVP2b = StpPatPriMot4H(pM + kHalfRcpC);
        #if STP_MAX_MIN_UINT
            mZVPN = StpPatPriMotMinH(pM);
        #else // STP_MAX_MIN_UINT
            StpU4 mZVP4 = StpPatPriMot4H(pM);
        #endif // STP_MAX_MIN_UINT
    #endif // (STP_SAFE_DILATE == 2)
//------------------------------------------------------------------------------------------------------------------------------
    StpU1 rPre = StpPatDatRH(pp);
//------------------------------------------------------------------------------------------------------------------------------
    // Gather 4 on feedback.
    StpH4 f4R = StpPatPriFedR4H(pF);
    StpH4 f4G = StpPatPriFedG4H(pF);
    StpH4 f4B = StpPatPriFedB4H(pF);
    // Grab bilinear feedback.
    StpH3 f = StpPatPriFedH(pF).rgb;
//==============================================================================================================================
//                                       DEPENDENT ON DITHER AND INLINE INPUT PARAMETERS
//==============================================================================================================================
    StpF1 dd = StpF1_(d);
    // Convert depth {0 to inf} to {0 to 1} safe for 10-bit value.
    StpF1 z = StpPatFixZH(zPre);
    z = StpZPack(z, kDepth, dd);
    // Pack {MSB depth, LSB 11-bit XY motion}.
    rM = StpMvPack(z, m, dd);
    StpPatStMotH(pp, rM);
//------------------------------------------------------------------------------------------------------------------------------
    #if STP_BUG
    // Pattern/Clipped Input Color
    {
        StpF4 bug = StpF4_(0.0);
        bug.rgb = sqrt(StpF3(c));
        bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
        StpBugF(StpU3(pp, 0), bug);
    }
//------------------------------------------------------------------------------------------------------------------------------
    // Pattern/Log Input Depth
    {
        StpF4 bug = StpF4_(0.0);
        bug.rgb = StpF3_(StpSatF1(z + StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
        StpBugF(StpU3(pp, 1), bug);
    }
    #endif // STP_BUG
//------------------------------------------------------------------------------------------------------------------------------
    // Pre-process color.
    // If running pre-tonemap, then do a fast reversible tonemapper (convert from {0 to inf} to {0 to 1}).
    #if (STP_POSTMAP == 0)
        StpToneH3(c);
    #endif // (STP_POSTMAP == 0)
//------------------------------------------------------------------------------------------------------------------------------
    #if STP_BUG
    // Pattern/Reversible Tonemapped Input Color
    {
        StpF4 bug = StpF4_(0.0);
        bug.rgb = sqrt(StpF3(c));
        bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
        StpBugF(StpU3(pp, 2), bug);
    }
    #endif // STP_BUG
//------------------------------------------------------------------------------------------------------------------------------
    // Output intermediate color.
    // Dither from linear to gamma 2.0.
    // Simple non-energy conserving dither is working, using 10-bit/channel.
    c = sqrt(c);
    rC.rgb = StpSatH3(c + StpH3_(d * StpH1(1.0 / 1023.0) + StpH1(-0.5 / 1023.0)));
//------------------------------------------------------------------------------------------------------------------------------
    // Setup the new 3-ring output luma.
    rL.x = dot(c, StpH3(STP_LUMA));
    rL.y = lum2.x;
    StpPatStLumH(pp, rL);
//------------------------------------------------------------------------------------------------------------------------------
    #if STP_BUG
    // Pattern/Shaped Absolute Input Motion
    {
        StpF4 bug = StpF4_(0.0);
        bug.b = sqrt(StpF1_(rL.x) * StpF1_(0.25));
        bug.rg = StpF2_(1.0) - exp2(abs(StpF2(m)) * StpF2_(-32.0));
        bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
        StpBugF(StpU3(pp, 3), bug);
    }
    #endif // STP_BUG
//------------------------------------------------------------------------------------------------------------------------------
    // Minimum change across the 3 frames {current, 2-frame reprojected history}.
    StpH1 moire = min(abs(rL.x - lum2.x), abs(lum2.x - lum2.y));
    moire *= StpH1_(STP_PAT_DEMOIRE);
//------------------------------------------------------------------------------------------------------------------------------
    // Grab neighborhood.
    // Parallel block {max,-min}, and -min of convergence.
    StpH4 xnyRG = StpH4(c.r, -c.r, c.g, -c.g);
    StpH4 xnyBC = StpH4(c.b, -c.b, -cnvPrev, -cnvPrev);
    #if defined(STP_16BIT)
        StpPat4x4MaxH8(lane, xnyRG, xnyBC);
    #else // defined(STP_16BIT)
        // Unreachable in this section (the enclosing guard requires STP_16BIT); kept in step with the 32-bit path.
        // We convert to full precision floats here since the reductions work on 32-bit values.
        // These are vector conversions, so use the plain constructors rather than the scalar-broadcast '_' forms.
        StpF4 xnyRGF = StpF4(xnyRG);
        StpF4 xnyBCF = StpF4(xnyBC);
        StpPat4x4MaxF8(lane, xnyRGF, xnyBCF);
        xnyRG = StpMF4(xnyRGF);
        xnyBC = StpMF4(xnyBCF);
    #endif // defined(STP_16BIT)
    cnvPrev = -xnyBC.z;
    // This is max minus min (the '.y' is already negative).
    StpH3 ne = max(StpH3_(STP_PAT_NE_MIN) * StpH3(xnyRG.x, xnyRG.z, xnyBC.x),
        StpH3(xnyRG.x + xnyRG.y, xnyRG.z + xnyRG.w, xnyBC.x + xnyBC.y));
    StpH1 ne1 = dot(ne, StpH3(STP_LUMA));
//------------------------------------------------------------------------------------------------------------------------------
    // Advance low frequency convergence.
    cnvPrev = StpSatH1(cnvPrev + StpH1_(1.0 / STP_FRAME_MAX));
//------------------------------------------------------------------------------------------------------------------------------
    // Estimate if reprojection is on-screen.
    StpF2 onXY = StpF2(pM.xy);
    // {-1 to 1} is on screen.
    onXY = onXY * kOS.xy + kOS.zw;
    // {0 := offscreen, 1 := onscreen}.
    StpF1 onS = StpSignedF1(max(abs(onXY.x), abs(onXY.y)) - StpF1_(1.0));
//------------------------------------------------------------------------------------------------------------------------------
    #if STP_BUG
    // Pattern/Motion Reprojection {R=Prior G=This Sqrt Luma Feedback Diff, B=Offscreen}
    {
        StpF4 bug = StpF4_(0.0);
        bug.g = StpF1_(abs(rL.x - lum2.x));
        bug.r = StpF1_(abs(lum2.x - lum2.y));
        bug.b = StpF1_(1.0) - StpF1_(onS);
        bug.rg = sqrt(bug.rg);
        bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
        StpBugF(StpU3(pp, 4), bug);
    }
    #endif // STP_BUG
//==============================================================================================================================
//                                               DEPENDENT ON PRIOR {Z, MOTION}
//==============================================================================================================================
    // Compute a motion match value.
    // Finish {z, motion} nearest dilation.
    #if (STP_SAFE_DILATE == 2)
        #if (STP_MAX_MIN_UINT == 0)
            StpU4 mZVP4 = min(StpMin3U4(mZVP4_0, mZVP4_1, mZVP4_2), mZVP4_3);
        #endif // (STP_MAX_MIN_UINT == 0)
        StpU1 mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
    #else // (STP_SAFE_DILATE == 2)
        #if (STP_MAX_MIN_UINT == 0)
            mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
        #endif // (STP_MAX_MIN_UINT == 0)
        #if STP_SAFE_DILATE
            mZVPN = StpMin3U1(StpMin3U1(mZVPN, mZVP2a.x, mZVP2a.z), mZVP2b.x, mZVP2b.z);
        #endif // STP_SAFE_DILATE
    #endif // (STP_SAFE_DILATE == 2)
//------------------------------------------------------------------------------------------------------------------------------
    // The {motion} matching logic.
    StpF2 mPN;
    StpF1 mZPN;
    // Motion 'm' units are {1 := move by one screen}.
    StpMvUnpack(mZPN, mPN, mZVPN);
//------------------------------------------------------------------------------------------------------------------------------
    StpF2 mE;
    // Use a smoother error estimate.
    // This '1/256' instead of '1/1024' is to be more accepting of a motion match.
    // The 'sqrt()' cannot be the low precision approximation without visually seeing differences in the mask.
    mE = sqrt(abs(m)) + StpF2_(1.0 / 256.0);
    mE = mE * mE - abs(m);
//------------------------------------------------------------------------------------------------------------------------------
    // Static geometry motion + estimated dynamic motion matching logic.
    // Take unpacked low precision {0 to 1} Z and decode to {0 to INF}.
    StpF1 sgZ = StpZUnpack(mZPN, kUnDepth);
    StpF2 bugF; StpF2 bugD;
    StpF2 sgM = StpFor(pM, sgZ, mPN, kMotionMatch, k0123, k4567, k89AB, kCDEF, kGHIJ, kKLMN, kOPQR, kST, bugF, bugD);
    // Note 'sgM' is in NDC {-1 to 1} space and 'm' is in {0 to 1} space, thus the 0.5 scaling factor.
    // The difference gets conservative possible motion encoding error subtracted out via 'saturate(abs(..)-mE)'.
    sgM = StpSatF2(abs(sgM * StpF2_(0.5) - m) - mE) * kC;
    StpH1 sgD = StpH1(dot(sgM, sgM));
//------------------------------------------------------------------------------------------------------------------------------
    // Motion match {0 := no match, 1 := match}.
    StpH1 match = StpH1_(1.0) - StpSatH1(sgD * StpH1_(STP_PAT_MOT_AMP) - StpH1_(STP_PAT_MOT_ADD * STP_PAT_MOT_AMP));
    // Offscreen is a non-match.
    match *= StpH1_(onS);
    // Pass motion match in alpha.
    rC.a = match;
    StpPatStColH(pp, rC);
//------------------------------------------------------------------------------------------------------------------------------
    // Must disable on non-motion match, but make sure it doesn't fully /0 later.
    moire = moire * match + StpH1_(1.0 / 8192.0);
    // Scale down temporal change proportional to ratio of local neighborhood and minimum 3-frame temporal change.
    moire = min(StpH1_(1.0), ne1 * StpRcpH1(moire));
//------------------------------------------------------------------------------------------------------------------------------
    // Sensitivity modifiers.
    // The following which gets optimized to two FMAs.
    //  tS = tS * ((1-v)*k + 1) ... logic
    //  tS = tS * ((1-v)*k) + tS
    //  tS = tS * (k-v*k) + tS ..... optimized
    StpH1 tS = moire;
    StpH1 r = StpPatFixRH(rPre);
    tS = tS * (StpH1_(STP_PAT_RESPONSIVE) - r * StpH1_(STP_PAT_RESPONSIVE)) + tS;
//------------------------------------------------------------------------------------------------------------------------------
    #if STP_BUG
    // Pattern/Sensitivity {G=No motion match, R=Responsive, B=Luma}
    {
        StpF4 bug = StpF4_(0.0);
        bug.g = StpF1_(1.0) - StpF1(match);
        bug.r = StpF1_(1.0) - StpF1(r);
        bug.b = StpF1_(rL.x);
        bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
        StpBugF(StpU3(pp, 5), bug);
    }
    #endif // STP_BUG
//==============================================================================================================================
//                                                    DEPENDENT ON FEEDBACK
//==============================================================================================================================
    // Find lowest temporal difference.
    StpH4 t;
    t.rgb = c - f;
    // Luma diff in alpha.
    t.a = dot(abs(t.rgb), StpH3(STP_LUMA));
    // Compute lowest difference for all in quad.
    StpH4 t4R = f4R - StpH4_(c.r);
    StpH4 t4G = f4G - StpH4_(c.g);
    StpH4 t4B = f4B - StpH4_(c.b);
    StpH4 t4A = abs(t4R) * StpH4_(STP_LUMA_R) + abs(t4G) * StpH4_(STP_LUMA_G) + abs(t4B) * StpH4_(STP_LUMA_B);
    // Override with lower from gather4.
    t.a = StpMin3H1(t.a, t4A.x, StpMin3H1(t4A.y, t4A.z, t4A.w));
    if(t.a == t4A.x) t.rgb = StpH3(t4R.x, t4G.x, t4B.x);
    if(t.a == t4A.y) t.rgb = StpH3(t4R.y, t4G.y, t4B.y);
    if(t.a == t4A.z) t.rgb = StpH3(t4R.z, t4G.z, t4B.z);
    if(t.a == t4A.w) t.rgb = StpH3(t4R.w, t4G.w, t4B.w);
//------------------------------------------------------------------------------------------------------------------------------
    // Factor in sensitivity and reduce.
    t.rgb *= StpH3_(tS);
//------------------------------------------------------------------------------------------------------------------------------
    #if defined(STP_16BIT)
        StpPat4x4SumH4(lane, t);
    #else // defined(STP_16BIT)
        // Unreachable in this section (the enclosing guard requires STP_16BIT); kept in step with the 32-bit path.
        // We convert to full precision floats here since the reductions work on 32-bit values, and MF might be 16-bit.
        StpF4 tF = StpF4(t);
        StpPat4x4SumF4(lane, tF);
        t = StpMF4(tF);
    #endif // defined(STP_16BIT)
    t.rgb *= StpH3_(STP_PAT_SENSITIVITY);
//------------------------------------------------------------------------------------------------------------------------------
    // Ratio of 'spatial/temporal' change.
    StpH3 bln3 = StpSatH3(ne * StpPrxLoRcpH3(abs(t.rgb)));
    // Worst channel limits to avoid chroma ghosting.
    StpH1 bln = StpMin3H1(bln3.r, bln3.g, bln3.b);
//------------------------------------------------------------------------------------------------------------------------------
    // Convert from blend ratio to convergence.
    // Note, 'rcp(0)=+INF' when approximations are not used.
    StpH1 cnv = StpSatH1(bln * StpPrxLoRcpH1(StpH1_(STP_FRAME_MAX) - StpH1_(STP_FRAME_MAX) * bln));
//------------------------------------------------------------------------------------------------------------------------------
    // Feedback the min of reprojected convergence, and subtract one frame (as next frame advances by one).
    cnv = StpSatH1(cnv - StpH1_(1.0 / STP_FRAME_MAX));
    rCnv = min(cnv, cnvPrev);
    StpPatStCnvH(pp, rCnv);
}
#endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_PAT)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//
//                                                PATTERN DILATION ENTRY POINT
//
//------------------------------------------------------------------------------------------------------------------------------
// This should be pass merged with STP_SAA.
// Dilates low frequency convergence.
2686//============================================================================================================================== 2687#if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_DIL) 2688 StpMF1 StpDilDitF(StpMU2 o); 2689 StpMF1 StpDilConF(StpF2 p); 2690 StpMF4 StpDilCon4F(StpF2 p); 2691 #if STP_OFFSETS 2692 StpMF1 StpDilConOF(StpF2 p, StpI2 o); 2693 StpMF4 StpDilCon4OF(StpF2 p, StpI2 o); 2694 #endif // STP_OFFSETS 2695//============================================================================================================================== 2696 void StpDilF(out StpMF1 oC, StpU2 pp, StpU4 con0) { 2697 StpF2 kRcpR = StpF2_U2(con0.xy); 2698//------------------------------------------------------------------------------------------------------------------------------ 2699 StpF2 p = StpF2(pp) * kRcpR; 2700//------------------------------------------------------------------------------------------------------------------------------ 2701 #if STP_BUG_BW_SOL 2702 { oC = StpDilCon4F(p).x; return; } 2703 #endif // STP_BUG_BW_SOL 2704//------------------------------------------------------------------------------------------------------------------------------ 2705 #if STP_OFFSETS 2706 StpMF4 g0 = StpDilCon4OF(p, StpI2(-1.0, -1.0)); 2707 StpMF4 g1 = StpDilCon4OF(p, StpI2( 1.0, -1.0)); 2708 StpMF4 g2 = StpDilCon4OF(p, StpI2( 3.0, -1.0)); 2709 StpMF4 g3 = StpDilCon4OF(p, StpI2(-1.0, 1.0)); 2710 StpMF4 g4 = StpDilCon4OF(p, StpI2( 1.0, 1.0)); 2711 StpMF4 g5 = StpDilCon4OF(p, StpI2( 3.0, 1.0)); 2712 StpMF4 g6 = StpDilCon4OF(p, StpI2(-1.0, 3.0)); 2713 StpMF4 g7 = StpDilCon4OF(p, StpI2( 1.0, 3.0)); 2714 StpMF4 g8 = StpDilCon4OF(p, StpI2( 3.0, 3.0)); 2715 #else // STP_OFFSETS 2716 StpMF4 g0 = StpDilCon4F(p + StpF2(-1.0 * kRcpR.x, -1.0 * kRcpR.y)); 2717 StpMF4 g1 = StpDilCon4F(p + StpF2( 1.0 * kRcpR.x, -1.0 * kRcpR.y)); 2718 StpMF4 g2 = StpDilCon4F(p + StpF2( 3.0 * kRcpR.x, -1.0 * kRcpR.y)); 2719 StpMF4 g3 = StpDilCon4F(p + StpF2(-1.0 * kRcpR.x, 1.0 * kRcpR.y)); 
2720 StpMF4 g4 = StpDilCon4F(p + StpF2( 1.0 * kRcpR.x, 1.0 * kRcpR.y)); 2721 StpMF4 g5 = StpDilCon4F(p + StpF2( 3.0 * kRcpR.x, 1.0 * kRcpR.y)); 2722 StpMF4 g6 = StpDilCon4F(p + StpF2(-1.0 * kRcpR.x, 3.0 * kRcpR.y)); 2723 StpMF4 g7 = StpDilCon4F(p + StpF2( 1.0 * kRcpR.x, 3.0 * kRcpR.y)); 2724 StpMF4 g8 = StpDilCon4F(p + StpF2( 3.0 * kRcpR.x, 3.0 * kRcpR.y)); 2725 #endif // STP_OFFSETS 2726//------------------------------------------------------------------------------------------------------------------------------ 2727 StpMF1 cA = g0.w; 2728 StpMF1 cB = g0.z; 2729 StpMF1 cC = g1.w; 2730 StpMF1 cD = g1.z; 2731 StpMF1 cE = g2.w; 2732 StpMF1 cF = g0.x; 2733 StpMF1 cG = g0.y; 2734 StpMF1 cH = g1.x; 2735 StpMF1 cI = g1.y; 2736 StpMF1 cJ = g2.x; 2737 StpMF1 cK = g3.w; 2738 StpMF1 cL = g3.z; 2739 StpMF1 cM = g4.w; 2740 StpMF1 cN = g4.z; 2741 StpMF1 cO = g5.w; 2742 StpMF1 cP = g3.x; 2743 StpMF1 cQ = g3.y; 2744 StpMF1 cR = g4.x; 2745 StpMF1 cS = g4.y; 2746 StpMF1 cT = g5.x; 2747 StpMF1 cU = g6.w; 2748 StpMF1 cV = g6.z; 2749 StpMF1 cW = g7.w; 2750 StpMF1 cX = g7.z; 2751 StpMF1 cY = g8.w; 2752//------------------------------------------------------------------------------------------------------------------------------ 2753 StpMF4 m1345; 2754 m1345.x = StpMin3MF1(StpMin3MF1(cG, cH, cI), cC, cM); 2755 m1345.y = StpMin3MF1(StpMin3MF1(cK, cL, cM), cG, cQ); 2756 m1345.z = StpMin3MF1(StpMin3MF1(cL, cM, cN), cH, cR); 2757 m1345.w = StpMin3MF1(StpMin3MF1(cM, cN, cO), cI, cS); 2758 StpMF1 m7 = StpMin3MF1(StpMin3MF1(cQ, cR, cS), cM, cW); 2759//------------------------------------------------------------------------------------------------------------------------------ 2760 StpMF1 b0 = StpMF1_(0.5); 2761 StpMF1 b1 = (StpMF1_(1.0) - b0) * StpMF1_(0.25); 2762 oC = m1345.z * b0 + m1345.x * b1 + m1345.y * b1 + m1345.w * b1 + m7 * b1; } 2763#endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_DIL) 
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                        16-BIT PATH
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_DIL)
    // Prototypes only; the definitions are not part of this section.
    // Some of these are unused, possibly for future experimentation.
    StpH1 StpDilDitH(StpW2 o);
    StpH1 StpDilConH(StpF2 p);
    StpH4 StpDilCon4H(StpF2 p);
    #if STP_OFFSETS
        StpH1 StpDilConOH(StpF2 p, StpI2 o);
        StpH4 StpDilCon4OH(StpF2 p, StpI2 o);
    #endif // STP_OFFSETS
//==============================================================================================================================
    // Dilation entry point (16-bit).
    //  oC ..... Output: weighted blend of five 5-point minimums taken from a 5x5 neighborhood.
    //  pp ..... Integer position; scaled by the factor unpacked from 'con0.xy'.
    //  con0 ... Packed constants; only '.xy' is read here (through StpF2_U2()).
    void StpDilH(out StpH1 oC, StpU2 pp, StpU4 con0) {
        StpF2 rcpRes = StpF2_U2(con0.xy);
        StpF2 uv = StpF2(pp) * rcpRes;
//------------------------------------------------------------------------------------------------------------------------------
        // Debug path: a single gather, then out.
        #if STP_BUG_BW_SOL
        {
            oC = StpDilCon4H(uv).x;
            return;
        }
        #endif // STP_BUG_BW_SOL
//------------------------------------------------------------------------------------------------------------------------------
        // Nine gathers laid out as a 3x3 grid (named g<column><row>).
        //   g00 g10 g20
        //
        //   g01 g11 g21
        //
        //   g02 g12 g22
        // Each gather returns four texels, giving this 6x6 footprint ([w] is the center texel).
        //   w z w z w z
        //   x y.x y x y
        //   w z[w]z w z
        //   x y x y x y
        //   w z w z w z
        //   x y x y x y
        #if STP_OFFSETS
            StpH4 g00 = StpDilCon4OH(uv, StpI2(-1.0, -1.0));
            StpH4 g10 = StpDilCon4OH(uv, StpI2( 1.0, -1.0));
            StpH4 g20 = StpDilCon4OH(uv, StpI2( 3.0, -1.0));
            StpH4 g01 = StpDilCon4OH(uv, StpI2(-1.0,  1.0));
            StpH4 g11 = StpDilCon4OH(uv, StpI2( 1.0,  1.0));
            StpH4 g21 = StpDilCon4OH(uv, StpI2( 3.0,  1.0));
            StpH4 g02 = StpDilCon4OH(uv, StpI2(-1.0,  3.0));
            StpH4 g12 = StpDilCon4OH(uv, StpI2( 1.0,  3.0));
            StpH4 g22 = StpDilCon4OH(uv, StpI2( 3.0,  3.0));
        #else // STP_OFFSETS
            // Same taps, with the offset folded into the coordinate instead of passed as an integer offset.
            StpH4 g00 = StpDilCon4H(uv + rcpRes * StpF2(-1.0, -1.0));
            StpH4 g10 = StpDilCon4H(uv + rcpRes * StpF2( 1.0, -1.0));
            StpH4 g20 = StpDilCon4H(uv + rcpRes * StpF2( 3.0, -1.0));
            StpH4 g01 = StpDilCon4H(uv + rcpRes * StpF2(-1.0,  1.0));
            StpH4 g11 = StpDilCon4H(uv + rcpRes * StpF2( 1.0,  1.0));
            StpH4 g21 = StpDilCon4H(uv + rcpRes * StpF2( 3.0,  1.0));
            StpH4 g02 = StpDilCon4H(uv + rcpRes * StpF2(-1.0,  3.0));
            StpH4 g12 = StpDilCon4H(uv + rcpRes * StpF2( 1.0,  3.0));
            StpH4 g22 = StpDilCon4H(uv + rcpRes * StpF2( 3.0,  3.0));
        #endif // STP_OFFSETS
//------------------------------------------------------------------------------------------------------------------------------
        // Name the 5x5 texels row by row.
        //  a b c d e
        //  f g h i j
        //  k l m n o
        //  p q r s t
        //  u v w x y
        // Not every texel is consumed below; the full table is kept so the layout stays readable.
        StpH1 tA = g00.w, tB = g00.z, tC = g10.w, tD = g10.z, tE = g20.w;
        StpH1 tF = g00.x, tG = g00.y, tH = g10.x, tI = g10.y, tJ = g20.x;
        StpH1 tK = g01.w, tL = g01.z, tM = g11.w, tN = g11.z, tO = g21.w;
        StpH1 tP = g01.x, tQ = g01.y, tR = g11.x, tS = g11.y, tT = g21.x;
        StpH1 tU = g02.w, tV = g02.z, tW = g12.w, tX = g12.z, tY = g22.w;
//------------------------------------------------------------------------------------------------------------------------------
        // Five 5-point (plus-shaped) minimums, centered on positions 1, 3, 4, 5 and 7 of the inner 3x3.
        //  . 1 .
        //  3 4 5
        //  . 7 .
        StpH1 minAt1 = StpMin3H1(StpMin3H1(tG, tH, tI), tC, tM);
        StpH1 minAt3 = StpMin3H1(StpMin3H1(tK, tL, tM), tG, tQ);
        StpH1 minAt4 = StpMin3H1(StpMin3H1(tL, tM, tN), tH, tR);
        StpH1 minAt5 = StpMin3H1(StpMin3H1(tM, tN, tO), tI, tS);
        StpH1 minAt7 = StpMin3H1(StpMin3H1(tQ, tR, tS), tM, tW);
//------------------------------------------------------------------------------------------------------------------------------
        // The center minimum gets half the weight; the other four split the remainder equally.
        StpH1 wCenter = StpH1_(0.5);
        StpH1 wOuter = (StpH1_(1.0) - wCenter) * StpH1_(0.25);
        oC = minAt4 * wCenter + minAt1 * wOuter + minAt3 * wOuter + minAt5 * wOuter + minAt7 * wOuter;
    }
#endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_DIL)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//
//                                              SPATIAL ANTI-ALIASING ENTRY POINT
//
//------------------------------------------------------------------------------------------------------------------------------
// This should be pass merged with STP_DIL.
// It's a shell, GEAA is separated as a modified form could be useful on its own.
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_SAA)
    // Prototypes only; the definitions are not part of this section.
    StpMF4 StpSaaLum4F(StpF2 p);
    #if STP_OFFSETS
        StpMF4 StpSaaLum4OF(StpF2 p, StpI2 o);
    #endif
//------------------------------------------------------------------------------------------------------------------------------
    // Route the GEAA gather hooks to the SAA luma callbacks.
    #define STP_GEAA 1
    StpMF4 StpGeaa4F(StpF2 p) { return StpSaaLum4F(p); }
    #if STP_OFFSETS
        StpMF4 StpGeaa4OF(StpF2 p, StpI2 o) { return StpSaaLum4OF(p, o); }
    #endif
    void StpGeaaF(out StpMF1 gW, out StpMF1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI);
//==============================================================================================================================
    // Spatial anti-aliasing entry point (32-bit). Thin shell around StpGeaaF().
    //  oN ..... Output: the first ('gW') output of StpGeaaF().
    //  pp ..... Integer input position.
    //  con0 ... Packed constants: '.xy' scales 'pp', '.zw' is added after scaling (both unpacked with StpF2_U2()).
    void StpSaaF(out StpMF1 oN, StpU2 pp, StpU4 con0) {
//------------------------------------------------------------------------------------------------------------------------------
        StpF2 kRcpC = StpF2_U2(con0.xy);
        StpF2 kHalfRcpC = StpF2_U2(con0.zw);
//------------------------------------------------------------------------------------------------------------------------------
        StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
//------------------------------------------------------------------------------------------------------------------------------
        // Debug path: a single luma gather, then out.
        #if STP_BUG_BW_SOL
        { oN = StpSaaLum4F(p).x; return; }
        #endif // STP_BUG_BW_SOL
//------------------------------------------------------------------------------------------------------------------------------
        // StpGeaaF() has four outputs; only the first is kept. The rest land in throwaway locals.
        StpMF1 gLuma;   // Receives StpGeaaF()'s 'gLuma' output (unused).
        StpF2 gFilter;  // Receives StpGeaaF()'s 'gFilter' output (unused).
        StpF2 gDilate;  // Receives StpGeaaF()'s 'gDilate' output (unused).
        StpGeaaF(oN, gLuma, gFilter, gDilate, p, kRcpC, kHalfRcpC);
    }
#endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_SAA)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                        16-BIT PATH
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_SAA)
    // Gather4 on current luma.
    StpH4 StpSaaLum4H(StpF2 p);
    #if STP_OFFSETS
        StpH4 StpSaaLum4OH(StpF2 p, StpI2 o);
    #endif
//------------------------------------------------------------------------------------------------------------------------------
    // Route the GEAA gather hooks to the SAA luma callbacks.
    #define STP_GEAA 1
    StpH4 StpGeaa4H(StpF2 p) { return StpSaaLum4H(p); }
    #if STP_OFFSETS
        StpH4 StpGeaa4OH(StpF2 p, StpI2 o) { return StpSaaLum4OH(p, o); }
    #endif
    void StpGeaaH(out StpH1 gW, out StpH1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI);
//==============================================================================================================================
    // Spatial anti-aliasing entry point (16-bit). Thin shell around StpGeaaH().
    void StpSaaH(
    out StpH1 oN, // Output control (to be stored).
    StpU2 pp,     // Input position {0 to size-1} across the input frame.
    StpU4 con0) { // Shared, first constant generated by StpPatCon().
//------------------------------------------------------------------------------------------------------------------------------
        StpF2 kRcpC = StpF2_U2(con0.xy);
        StpF2 kHalfRcpC = StpF2_U2(con0.zw);
//------------------------------------------------------------------------------------------------------------------------------
        // Float position {0 to 1} across screen.
        StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
//------------------------------------------------------------------------------------------------------------------------------
        // Debug path: a single luma gather, then out.
        #if STP_BUG_BW_SOL
        { oN = StpSaaLum4H(p).x; return; }
        #endif // STP_BUG_BW_SOL
//------------------------------------------------------------------------------------------------------------------------------
        // StpGeaaH() has four outputs; only the first is kept. The rest land in throwaway locals.
        StpH1 gLuma;    // Receives StpGeaaH()'s 'gLuma' output (unused).
        StpF2 gFilter;  // Output position for anti-aliased color sampling if standalone (unused).
        StpF2 gDilate;  // Output for {z,motion} dilation (unused).
        StpGeaaH(oN, gLuma, gFilter, gDilate, p, kRcpC, kHalfRcpC);
    }
#endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_SAA)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//
//                                                   SCALING TAA ENTRY POINT
//
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_TAA) && defined(STP_32BIT)
    StpMF4 StpTaaCtl4F(StpF2 p);
//------------------------------------------------------------------------------------------------------------------------------
    StpMF4 StpTaaCol4RF(StpF2 p);
    StpMF4 StpTaaCol4GF(StpF2 p);
    StpMF4 StpTaaCol4BF(StpF2 p);
    StpMF4 StpTaaCol4AF(StpF2 p);
//------------------------------------------------------------------------------------------------------------------------------
    StpMF1 StpTaaConF(StpF2 p);
//------------------------------------------------------------------------------------------------------------------------------
    StpMF1 StpTaaDitF(StpMU2 o);
//------------------------------------------------------------------------------------------------------------------------------
    StpU4 StpTaaMot4F(StpF2 p);
//------------------------------------------------------------------------------------------------------------------------------
    // Feedback callbacks (prototypes only). The 16-bit path documents its counterparts as:
    // bilinear fetch with clamp to edge (PriFed), Gather4 per channel (PriFed4*),
    // min/max sampling used for dering (PriFedMax/Min), and sampling with offsets (*O*).
    StpMF4 StpTaaPriFedF(StpF2 p);
    StpMF4 StpTaaPriFed4RF(StpF2 p);
    StpMF4 StpTaaPriFed4GF(StpF2 p);
    StpMF4 StpTaaPriFed4BF(StpF2 p);
    #if STP_MAX_MIN_10BIT
        StpMF4 StpTaaPriFedMaxF(StpF2 p);
        StpMF4 StpTaaPriFedMinF(StpF2 p);
    #endif // STP_MAX_MIN_10BIT
    #if STP_OFFSETS
        StpMF4 StpTaaPriFedOF(StpF2 p, StpI2 o);
        StpMF4 StpTaaPriFed4ROF(StpF2 p, StpI2 o);
        StpMF4 StpTaaPriFed4GOF(StpF2 p, StpI2 o);
        StpMF4 StpTaaPriFed4BOF(StpF2 p, StpI2 o);
    #endif // STP_OFFSETS
//==============================================================================================================================
    // Scaling TAA entry point (32-bit).
    // Parameter notes follow the documentation on the 16-bit StpTaaH() signature.
    void StpTaaF(
    StpMU1 lane,    // Not read by this function.
    StpMU2 o,       // Integer pixel offset in output.
    out StpMF4 rF,  // Return Feedback (to be stored).
    out StpMF4 rW,  // Return Output (to be stored).
    StpU4 con0,     // Packed constants, unpacked below with StpF2_U2().
    StpU4 con1,
    StpU4 con2,
    StpU4 con3) {
//------------------------------------------------------------------------------------------------------------------------------
        // Fetched but not consumed anywhere below.
        StpMF1 dit = StpTaaDitF(o);
//------------------------------------------------------------------------------------------------------------------------------
        // Rename constants.
        StpF2 kCRcpF = StpF2_U2(con0.xy);
        StpF2 kHalfCRcpFUnjitC = StpF2_U2(con0.zw);
        StpF2 kRcpC = StpF2_U2(con1.xy);
        StpF2 kRcpF = StpF2_U2(con1.zw);
        StpF2 kHalfRcpF = StpF2_U2(con2.xy);
        StpF2 kJitCRcpC0 = StpF2_U2(con2.zw);
        StpF2 kHalfRcpC = StpF2_U2(con3.xy);
        StpF2 kF = StpF2_U2(con3.zw);
//------------------------------------------------------------------------------------------------------------------------------
        // Debug path: one fetch from each input, summed into both outputs, then out.
        // This block used to call an undeclared 'StpTaaCtl4RF' and add 'l4' before its declaration,
        // so it could not compile when STP_BUG_BW_SOL was enabled. It now uses the declared
        // StpTaaCtl4F() callback and sums only values that are defined here.
        #if STP_BUG_BW_SOL
        {   StpF2 oo = StpF2(o) * kRcpF;
            StpMF4 g4 = StpTaaCtl4F(oo);
            StpU4 m4 = StpTaaMot4F(oo);
            StpMF1 cnv = StpTaaConF(oo);
            StpMF4 f = StpTaaPriFedF(oo);
            StpMF4 c4R = StpTaaCol4RF(oo);
            rW = rF = g4 + StpMF4(m4) + StpMF4_(cnv) + f + c4R;
            return; }
        #endif // STP_BUG_BW_SOL
//------------------------------------------------------------------------------------------------------------------------------
        // 'oC4' is the coordinate used for all Gather4 callbacks, 'oC1' the one used for the convergence fetch.
        StpF2 oI = StpF2(o);
        StpF2 oC = oI * kCRcpF + kHalfCRcpFUnjitC;
        StpF2 oCNW = floor(oC + StpF2_(-0.5));
        StpF2 oC4 = oCNW * kRcpC + kRcpC;
        StpF2 oC1 = oC * kRcpC;
//==============================================================================================================================
//                                       FETCH {CONVERGENCE, COLOR, CONTROL, Z+MOTION}
//==============================================================================================================================
        StpMF1 cnv = StpTaaConF(oC1);
        StpMF4 c4R = StpTaaCol4RF(oC4);
        StpMF4 c4G = StpTaaCol4GF(oC4);
        StpMF4 c4B = StpTaaCol4BF(oC4);
        StpMF4 c4A = StpTaaCol4AF(oC4);
        StpMF4 g4 = StpTaaCtl4F(oC4);
        StpU4 m4 = StpTaaMot4F(oC4);
//------------------------------------------------------------------------------------------------------------------------------
//                                                         INDEPENDENT
//------------------------------------------------------------------------------------------------------------------------------
        // Fractional position inside the gathered quad, re-centered by -0.5.
        StpMF2 rP = StpMF2(oC - oCNW) - StpMF2_(0.5);
//------------------------------------------------------------------------------------------------------------------------------
        StpMF2 rPX10 = StpMF2(1.0, 0.0) + StpMF2(-rP.x, rP.x);
        StpMF2 rPY01 = StpMF2(0.0, 1.0) + StpMF2(rP.y, -rP.y);
        StpMF4 pen4x = StpMF4(rPX10.g, rPX10.r, rPX10.r, rPX10.g);
        StpMF4 pen4y = StpMF4(rPY01.g, rPY01.g, rPY01.r, rPY01.r);
        StpMF4 pen4 = StpSatMF4(pen4x * pen4x + pen4y * pen4y);
//==============================================================================================================================
//                                                 DEPENDENT ON {CONVERGENCE}
//==============================================================================================================================
        cnv = StpSatMF1(cnv - StpMF1_(1.0 / STP_FRAME_MAX));
//------------------------------------------------------------------------------------------------------------------------------
        StpMF1 pen = StpMF1_(cnv) * StpMF1_(STP_FRAME_MAX) + StpMF1_(1.0);
        pen = StpPrxLoSqrtMF1(pen);
        pen4 = StpSatMF4(StpMF4_(1.0) - pen4 * StpMF4_(pen));
        // NOTE(review): every '#if defined(STP_16BIT)' branch in this function is empty. Several locals
        // ('fRcp', 'match', 't', 'c') exist only in the '#else' side, so this section presumably is not
        // meant to be built with STP_16BIT defined -- confirm.
        #if defined(STP_16BIT)
        #else // defined(STP_16BIT)
            pen = StpSatMF1(pen4.x * pen4.x + pen4.y * pen4.y + pen4.z * pen4.z + pen4.w * pen4.w);
        #endif // defined(STP_16BIT)
//==============================================================================================================================
//                                                    DEPENDENT ON {COLOR}
//==============================================================================================================================
        StpMF4 wG;
        StpMF4 l4 = c4R + c4G * StpMF4_(2.0) + c4B;
        // Compare the two diagonal differences of the quad and pick the weight set accordingly.
        StpMF2 difST = abs(l4.gr - l4.ab);
        StpP1 useS = difST.x > difST.y;
        StpMF2 wTrb = StpSatMF2(StpMF2(-rP.x, rP.x) + StpMF2(rP.y, -rP.y));
        StpMF2 wSrb = min(rPX10, rPY01);
        if(useS) wTrb = wSrb;
        StpMF2 wTga = rPY01 - wTrb;
        wG.rg = StpMF2(wTrb.x, wTga.x);
        wG.ba = StpMF2(wTrb.y, wTga.y);
        // Raise the weights to the 4th power.
        wG *= wG;
        wG *= wG;
//------------------------------------------------------------------------------------------------------------------------------
        wG *= g4;
        StpMF4 triMask = StpMF4_(1.0);
        StpMF2 wGmin2 = min(wG.xy, wG.zw);
//==============================================================================================================================
//                                                  DEPENDENT ON {Z,MOTION}
//==============================================================================================================================
        // The lane with the smallest 'wG' gets STP_TAA_TRI_MASK_AVOID in 'triMask', and its packed motion
        // is forced to 0xFFFFFFFF so it loses the min() that selects 'm1'.
        if(wGmin2.x < wGmin2.y) {
            if(wG.x < wG.z) { triMask.x = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.x = 0xFFFFFFFF; }
            else            { triMask.z = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.z = 0xFFFFFFFF; } }
        else {
            if(wG.y < wG.w) { triMask.y = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.y = 0xFFFFFFFF; }
            else            { triMask.w = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.w = 0xFFFFFFFF; } }
        StpU1 m1 = min(StpMin3U1(m4.x, m4.y, m4.z), m4.w);
//------------------------------------------------------------------------------------------------------------------------------
        wG *= triMask;
//------------------------------------------------------------------------------------------------------------------------------
        StpF2 mXY;
        StpMvUnpackV(mXY, m1);
//==============================================================================================================================
//                                               GET ALL FEEDBACK FILTERING DONE
//==============================================================================================================================
        StpF2 oF = oI * kRcpF + kHalfRcpF - mXY;
//------------------------------------------------------------------------------------------------------------------------------
        // 'f' is the filtered feedback color: a single fetch, or one of the two STP_TAA_PRX_LANCZOS variants.
        StpMF3 f;
        #if STP_TAA_PRX_LANCZOS
            StpF2 oM = oI + StpF2_(0.5) - mXY * kF;
            StpF2 oMNW = floor(oM + StpF2_(-0.5));
            StpF2 oM4 = oMNW * kRcpF + kRcpF;
            StpMF3 fMax, fMin;
        #else // STP_TAA_PRX_LANCZOS
            f = StpTaaPriFedF(oF).rgb;
        #endif // STP_TAA_PRX_LANCZOS
//==============================================================================================================================
        // Variant 1: four fetches stepped along Y, weighted by a 4-tap kernel in Y only.
        #if (STP_TAA_PRX_LANCZOS == 1)
            #if STP_OFFSETS
                StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
                StpMF3 f0 = StpTaaPriFedF(oM0).rgb;
                StpMF3 f1 = StpTaaPriFedOF(oM0, StpI2(0, 1)).rgb;
                StpMF3 f2 = StpTaaPriFedOF(oM0, StpI2(0, 2)).rgb;
                StpMF3 f3 = StpTaaPriFedOF(oM0, StpI2(0, 3)).rgb;
            #else // STP_OFFSETS
                StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
                StpF2 oM1 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-0.5));
                StpF2 oM2 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 0.5));
                StpF2 oM3 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 1.5));
                StpMF3 f0 = StpTaaPriFedF(oM0).rgb;
                StpMF3 f1 = StpTaaPriFedF(oM1).rgb;
                StpMF3 f2 = StpTaaPriFedF(oM2).rgb;
                StpMF3 f3 = StpTaaPriFedF(oM3).rgb;
            #endif // STP_OFFSETS
            #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
                fMax = StpTaaPriFedMaxF(oM4).rgb;
                fMin = StpTaaPriFedMinF(oM4).rgb;
            #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
            #if ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
                StpMF4 f4R = StpTaaPriFed4RF(oM4);
                StpMF4 f4G = StpTaaPriFed4GF(oM4);
                StpMF4 f4B = StpTaaPriFed4BF(oM4);
            #endif // ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
//------------------------------------------------------------------------------------------------------------------------------
//                                                         INDEPENDENT
//------------------------------------------------------------------------------------------------------------------------------
            // Kernel weights for the four taps, from the fractional Y position.
            StpMF2 fP = StpMF2(oM - oMNW);
            StpMF4 fPY = StpMF4_(-fP.y * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
            fPY = StpSatMF4(StpMF4_(1.0) - fPY * fPY);
            fPY *= fPY;
            StpMF4 fPY4 = fPY * fPY;
            fPY = (StpMF4_(1.0 + 81.0 / 175.0) * fPY4 - StpMF4_(81.0 / 175.0)) * fPY;
            #if defined(STP_16BIT)
            #else // defined(STP_16BIT)
                StpMF1 fRcp = StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a);
            #endif // defined(STP_16BIT)
//------------------------------------------------------------------------------------------------------------------------------
//                                                          DEPENDENT
//------------------------------------------------------------------------------------------------------------------------------
            // Weighted sum of the taps, normalized by the reciprocal of the weight sum.
            f.rgb = f0 * StpMF3_(fPY.r) + f1 * StpMF3_(fPY.g) + f2 * StpMF3_(fPY.b) + f3 * StpMF3_(fPY.a);
            f.rgb *= StpMF3_(fRcp);
            // Dering: clamp the filtered result to the min/max of the local feedback samples.
            #if STP_TAA_PRX_LANCZOS_DERING
                #if (STP_MAX_MIN_10BIT == 0)
                    #if defined(STP_16BIT)
                    #else // defined(STP_16BIT)
                        fMax.r = max(StpMax3MF1(f4R.x, f4R.y, f4R.z), f4R.w);
                        fMax.g = max(StpMax3MF1(f4G.x, f4G.y, f4G.z), f4G.w);
                        fMax.b = max(StpMax3MF1(f4B.x, f4B.y, f4B.z), f4B.w);
                        fMin.r = min(StpMin3MF1(f4R.x, f4R.y, f4R.z), f4R.w);
                        fMin.g = min(StpMin3MF1(f4G.x, f4G.y, f4G.z), f4G.w);
                        fMin.b = min(StpMin3MF1(f4B.x, f4B.y, f4B.z), f4B.w);
                        f = clamp(f, fMin, fMax);
                    #endif // defined(STP_16BIT)
                #else // (STP_MAX_MIN_10BIT == 0)
                    f = clamp(f, fMin, fMax);
                #endif // (STP_MAX_MIN_10BIT == 0)
            #endif // STP_TAA_PRX_LANCZOS_DERING
        #endif // (STP_TAA_PRX_LANCZOS == 1)
//==============================================================================================================================
        // Variant 2: four Gather4s per channel (a 4x4 texel footprint), weighted by a separable 4x4 kernel.
        #if (STP_TAA_PRX_LANCZOS == 2)
            #if STP_OFFSETS
                StpMF4 f4R0 = StpTaaPriFed4ROF(oM4, StpI2(-1, -1));
                StpMF4 f4G0 = StpTaaPriFed4GOF(oM4, StpI2(-1, -1));
                StpMF4 f4B0 = StpTaaPriFed4BOF(oM4, StpI2(-1, -1));
                StpMF4 f4R1 = StpTaaPriFed4ROF(oM4, StpI2( 1, -1));
                StpMF4 f4G1 = StpTaaPriFed4GOF(oM4, StpI2( 1, -1));
                StpMF4 f4B1 = StpTaaPriFed4BOF(oM4, StpI2( 1, -1));
                StpMF4 f4R2 = StpTaaPriFed4ROF(oM4, StpI2(-1,  1));
                StpMF4 f4G2 = StpTaaPriFed4GOF(oM4, StpI2(-1,  1));
                StpMF4 f4B2 = StpTaaPriFed4BOF(oM4, StpI2(-1,  1));
                StpMF4 f4R3 = StpTaaPriFed4ROF(oM4, StpI2( 1,  1));
                StpMF4 f4G3 = StpTaaPriFed4GOF(oM4, StpI2( 1,  1));
                StpMF4 f4B3 = StpTaaPriFed4BOF(oM4, StpI2( 1,  1));
            #else // STP_OFFSETS
                StpF2 oM0 = oM4 + StpF2(-kRcpF.x, -kRcpF.y);
                StpF2 oM1 = oM4 + StpF2( kRcpF.x, -kRcpF.y);
                StpF2 oM2 = oM4 + StpF2(-kRcpF.x,  kRcpF.y);
                StpF2 oM3 = oM4 + StpF2( kRcpF.x,  kRcpF.y);
                StpMF4 f4R0 = StpTaaPriFed4RF(oM0);
                StpMF4 f4G0 = StpTaaPriFed4GF(oM0);
                StpMF4 f4B0 = StpTaaPriFed4BF(oM0);
                StpMF4 f4R1 = StpTaaPriFed4RF(oM1);
                StpMF4 f4G1 = StpTaaPriFed4GF(oM1);
                StpMF4 f4B1 = StpTaaPriFed4BF(oM1);
                StpMF4 f4R2 = StpTaaPriFed4RF(oM2);
                StpMF4 f4G2 = StpTaaPriFed4GF(oM2);
                StpMF4 f4B2 = StpTaaPriFed4BF(oM2);
                StpMF4 f4R3 = StpTaaPriFed4RF(oM3);
                StpMF4 f4G3 = StpTaaPriFed4GF(oM3);
                StpMF4 f4B3 = StpTaaPriFed4BF(oM3);
            #endif // STP_OFFSETS
            #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
                fMax = StpTaaPriFedMaxF(oM4).rgb;
                fMin = StpTaaPriFedMinF(oM4).rgb;
            #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
//------------------------------------------------------------------------------------------------------------------------------
//                                                         INDEPENDENT
//------------------------------------------------------------------------------------------------------------------------------
            // Per-axis kernel weights, each normalized to sum to one, then combined per row.
            StpMF2 fP = StpMF2(oM - oMNW);
            StpMF4 fPX = StpMF4_(-fP.x * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
            StpMF4 fPY = StpMF4_(-fP.y * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
            fPX = StpSatMF4(StpMF4_(1.0) - fPX * fPX);
            fPY = StpSatMF4(StpMF4_(1.0) - fPY * fPY);
            fPX *= fPX;
            fPY *= fPY;
            StpMF4 fPX4 = fPX * fPX;
            StpMF4 fPY4 = fPY * fPY;
            fPX = (StpMF4_(1.0 + 81.0 / 175.0) * fPX4 - StpMF4_(81.0 / 175.0)) * fPX;
            fPY = (StpMF4_(1.0 + 81.0 / 175.0) * fPY4 - StpMF4_(81.0 / 175.0)) * fPY;
            #if defined(STP_16BIT)
            #else // defined(STP_16BIT)
                fPX *= StpMF4_(StpPrxLoRcpMF1(fPX.r + fPX.g + fPX.b + fPX.a));
                fPY *= StpMF4_(StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a));
            #endif // defined(STP_16BIT)
            StpMF4 fPX0 = fPX * StpMF4_(fPY.r);
            StpMF4 fPX1 = fPX * StpMF4_(fPY.g);
            StpMF4 fPX2 = fPX * StpMF4_(fPY.b);
            StpMF4 fPX3 = fPX * StpMF4_(fPY.a);
//------------------------------------------------------------------------------------------------------------------------------
//                                                          DEPENDENT
//------------------------------------------------------------------------------------------------------------------------------
            // 16-tap weighted sums, one kernel row per source line.
            #if defined(STP_16BIT)
            #else // defined(STP_16BIT)
                f.r = f4R0.w * fPX0.r + f4R0.z * fPX0.g + f4R1.w * fPX0.b + f4R1.z * fPX0.a +
                      f4R0.x * fPX1.r + f4R0.y * fPX1.g + f4R1.x * fPX1.b + f4R1.y * fPX1.a +
                      f4R2.w * fPX2.r + f4R2.z * fPX2.g + f4R3.w * fPX2.b + f4R3.z * fPX2.a +
                      f4R2.x * fPX3.r + f4R2.y * fPX3.g + f4R3.x * fPX3.b + f4R3.y * fPX3.a;
                f.g = f4G0.w * fPX0.r + f4G0.z * fPX0.g + f4G1.w * fPX0.b + f4G1.z * fPX0.a +
                      f4G0.x * fPX1.r + f4G0.y * fPX1.g + f4G1.x * fPX1.b + f4G1.y * fPX1.a +
                      f4G2.w * fPX2.r + f4G2.z * fPX2.g + f4G3.w * fPX2.b + f4G3.z * fPX2.a +
                      f4G2.x * fPX3.r + f4G2.y * fPX3.g + f4G3.x * fPX3.b + f4G3.y * fPX3.a;
                f.b = f4B0.w * fPX0.r + f4B0.z * fPX0.g + f4B1.w * fPX0.b + f4B1.z * fPX0.a +
                      f4B0.x * fPX1.r + f4B0.y * fPX1.g + f4B1.x * fPX1.b + f4B1.y * fPX1.a +
                      f4B2.w * fPX2.r + f4B2.z * fPX2.g + f4B3.w * fPX2.b + f4B3.z * fPX2.a +
                      f4B2.x * fPX3.r + f4B2.y * fPX3.g + f4B3.x * fPX3.b + f4B3.y * fPX3.a;
            #endif // defined(STP_16BIT)
            // Dering: clamp to the min/max of one texel from each of the four gathers.
            #if STP_TAA_PRX_LANCZOS_DERING
                #if (STP_MAX_MIN_10BIT == 0)
                    #if defined(STP_16BIT)
                    #else // defined(STP_16BIT)
                        fMax.r = max(StpMax3MF1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
                        fMax.g = max(StpMax3MF1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
                        fMax.b = max(StpMax3MF1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
                        fMin.r = min(StpMin3MF1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
                        fMin.g = min(StpMin3MF1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
                        fMin.b = min(StpMin3MF1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
                        f = clamp(f, fMin, fMax);
                    #endif // defined(STP_16BIT)
                #else // (STP_MAX_MIN_10BIT == 0)
                    f = clamp(f, fMin, fMax);
                #endif // (STP_MAX_MIN_10BIT == 0)
            #endif // STP_TAA_PRX_LANCZOS_DERING
        #endif // (STP_TAA_PRX_LANCZOS == 2)
//==============================================================================================================================
//                                                        DISPLACEMENT
//==============================================================================================================================
        // Four feedback fetches at the gather coordinate shifted by 'kJitCRcpC0' and the selected motion,
        // stepped by 'kRcpC' in X and by '-kRcpC' in Y.
        StpF2 oD0 = oC4 + kJitCRcpC0 - mXY;
        StpF2 oD1 = StpF2(kRcpC.x, 0.0) + oD0;
        StpF2 oD2 = StpF2(kRcpC.x, -kRcpC.y) + oD0;
        StpF2 oD3 = StpF2(0.0, -kRcpC.y) + oD0;
        StpMF3 d0 = StpTaaPriFedF(oD0).rgb;
        StpMF3 d1 = StpTaaPriFedF(oD1).rgb;
        StpMF3 d2 = StpTaaPriFedF(oD2).rgb;
        StpMF3 d3 = StpTaaPriFedF(oD3).rgb;
//------------------------------------------------------------------------------------------------------------------------------
//                                                         INDEPENDENT
//------------------------------------------------------------------------------------------------------------------------------
        // Normalize the 'wG' weights to sum to one.
        #if defined(STP_16BIT)
        #else // defined(STP_16BIT)
            wG = StpSatMF4(wG * StpMF4_(StpPrxLoRcpMF1(wG.x + wG.y + wG.z + wG.w)));
        #endif // defined(STP_16BIT)
//------------------------------------------------------------------------------------------------------------------------------
        // 'wT': weights from the luma-weighted absolute difference between each input texel and 'f'.
        StpMF4 wT = abs(c4R - StpMF4_(f.r)) * StpMF4_(STP_LUMA_R) +
                    abs(c4G - StpMF4_(f.g)) * StpMF4_(STP_LUMA_G) +
                    abs(c4B - StpMF4_(f.b)) * StpMF4_(STP_LUMA_B);
        wT = StpPrxLoRcpMF4(wT * StpMF4_(STP_ANTI_MAX) + StpMF4_(STP_ANTI_MIN)) * triMask;
//------------------------------------------------------------------------------------------------------------------------------
        #if defined(STP_16BIT)
        #else // defined(STP_16BIT)
            wT = StpSatMF4(wT * StpMF4_(StpPrxLoRcpMF1(wT.x + wT.y + wT.z + wT.w)));
        #endif // defined(STP_16BIT)
//------------------------------------------------------------------------------------------------------------------------------
        // Average the two weight sets and use them to interpolate the alpha-channel gather.
        StpMF4 wM = wT * StpMF4_(0.5) + wG * StpMF4_(0.5);
        #if defined(STP_16BIT)
        #else // defined(STP_16BIT)
            StpMF1 match = c4A.x * wM.x + c4A.y * wM.y + c4A.z * wM.z + c4A.w * wM.w;
        #endif // defined(STP_16BIT)
        cnv *= match;
//------------------------------------------------------------------------------------------------------------------------------
//                                                          DEPENDENT
//------------------------------------------------------------------------------------------------------------------------------
        // Displaced feedback interpolated with each weight set.
        StpMF3 dG = d0 * StpMF3_(wG.x) + d1 * StpMF3_(wG.y) + d2 * StpMF3_(wG.z) + d3 * StpMF3_(wG.w);
        StpMF3 dT = d0 * StpMF3_(wT.x) + d1 * StpMF3_(wT.y) + d2 * StpMF3_(wT.z) + d3 * StpMF3_(wT.w);
//------------------------------------------------------------------------------------------------------------------------------
        // Current-frame color interpolated with each weight set ('t' uses 'wT', 'c' uses 'wG').
        #if defined(STP_16BIT)
        #else // defined(STP_16BIT)
            StpMF3 t = StpMF3(
                c4R.x * wT.x + c4R.y * wT.y + c4R.z * wT.z + c4R.w * wT.w,
                c4G.x * wT.x + c4G.y * wT.y + c4G.z * wT.z + c4G.w * wT.w,
                c4B.x * wT.x + c4B.y * wT.y + c4B.z * wT.z + c4B.w * wT.w);
            StpMF3 c = StpMF3(
                c4R.x * wG.x + c4R.y * wG.y + c4R.z * wG.z + c4R.w * wG.w,
                c4G.x * wG.x + c4G.y * wG.y + c4G.z * wG.z + c4G.w * wG.w,
                c4B.x * wG.x + c4B.y * wG.y + c4B.z * wG.z + c4B.w * wG.w);
        #endif // defined(STP_16BIT)
//------------------------------------------------------------------------------------------------------------------------------
        // Blend ratio derived from convergence, and the clamp bounds used for both outputs.
        StpMF1 bln = StpSatMF1(cnv * StpPrxLoRcpMF1(cnv + StpMF1_(1.0 / STP_FRAME_MAX)));
        StpMF1 blnT = StpMF1_(1.0) - bln;
        StpMF3 b = f * StpMF3_(bln) + t * StpMF3_(blnT);
        StpMF3 minNe = min(c, b);
        StpMF3 maxNe = max(c, b);
//------------------------------------------------------------------------------------------------------------------------------
        // Mix in 'penC' with separate amounts for output (penWF.x) and feedback (penWF.y), then clamp.
        StpMF3 penC = StpSatMF3(c + (f - dG) * StpMF3_(StpMF1_(0.9875) * match));
        StpMF2 penWF;
        penWF.x = pen * StpMF1_(STP_TAA_PEN_W);
        penWF.y = pen * lerp(StpMF1_(STP_TAA_PEN_F0), StpMF1_(STP_TAA_PEN_F1), cnv);
        StpMF2 penNotWF = StpMF2_(1.0) - penWF;
        rF.rgb = t + (f - dT);
        rF.rgb = rF.rgb * StpMF3_(blnT) + f * StpMF3_(bln);
        // 'rW' must be derived before 'rF' is overwritten on the next line.
        rW.rgb = StpSatMF3(rF.rgb * StpMF3_(penNotWF.x) + penC * StpMF3_(penWF.x));
        rF.rgb = StpSatMF3(rF.rgb * StpMF3_(penNotWF.y) + penC * StpMF3_(penWF.y));
        rW.rgb = clamp(rW.rgb, minNe, maxNe);
        rF.rgb = clamp(rF.rgb, minNe, maxNe);
//------------------------------------------------------------------------------------------------------------------------------
        // The output is squared, and passed through StpToneInvMF3() when STP_POSTMAP is 0.
        rW.rgb *= rW.rgb;
        #if (STP_POSTMAP == 0)
            StpToneInvMF3(rW.rgb);
        #endif // (STP_POSTMAP == 0)
        rF.a = rW.a = StpMF1(0.0);
    }
#endif // defined(STP_GPU) && defined(STP_TAA) && defined(STP_32BIT)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                        16-BIT PATH
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_TAA) && defined(STP_16BIT)
    // Callbacks.
    // Gather4 of GEAA control data.
3353 StpH4 StpTaaCtl4H(StpF2 p); 3354//------------------------------------------------------------------------------------------------------------------------------ 3355 // Current frame {color,anti} input. 3356 // Gather4 specific channels. 3357 StpH4 StpTaaCol4RH(StpF2 p); 3358 StpH4 StpTaaCol4GH(StpF2 p); 3359 StpH4 StpTaaCol4BH(StpF2 p); 3360 StpH4 StpTaaCol4AH(StpF2 p); 3361//------------------------------------------------------------------------------------------------------------------------------ 3362 // Bilinear sampling of low-frequency convergence. 3363 StpH1 StpTaaConH(StpF2 p); 3364//------------------------------------------------------------------------------------------------------------------------------ 3365 // Dither value {0 to 1} this should be output pixel frequency spatial temporal blue noise. 3366 StpH1 StpTaaDitH(StpW2 o); 3367//------------------------------------------------------------------------------------------------------------------------------ 3368 // Gather4 current frame motion {z,x,y} packed input, same as the 32-bit version (just renamed). 3369 StpU4 StpTaaMot4H(StpF2 p); 3370//------------------------------------------------------------------------------------------------------------------------------ 3371 // Feedback {color, alpha}. 3372 // Bilinear fetch with clamp to edge. 3373 StpH4 StpTaaPriFedH(StpF2 p); 3374 // Gather4. 3375 StpH4 StpTaaPriFed4RH(StpF2 p); 3376 StpH4 StpTaaPriFed4GH(StpF2 p); 3377 StpH4 StpTaaPriFed4BH(StpF2 p); 3378 // Min/max sampling used for dering. 3379 #if STP_MAX_MIN_10BIT 3380 StpH4 StpTaaPriFedMaxH(StpF2 p); 3381 StpH4 StpTaaPriFedMinH(StpF2 p); 3382 #endif // STP_MAX_MIN_10BIT 3383 // Sampling with offsets. 
3384 #if STP_OFFSETS 3385 StpH4 StpTaaPriFedOH(StpF2 p, StpI2 o); 3386 StpH4 StpTaaPriFed4ROH(StpF2 p, StpI2 o); 3387 StpH4 StpTaaPriFed4GOH(StpF2 p, StpI2 o); 3388 StpH4 StpTaaPriFed4BOH(StpF2 p, StpI2 o); 3389 #endif // STP_OFFSETS 3390//============================================================================================================================== 3391 void StpTaaH( 3392 StpW1 lane, // Currently unused but in the interface for possible future expansion. 3393 StpW2 o, // Integer pixel offset in output. 3394 out StpH4 rF, // Return Feedback (to be stored). 3395 out StpH4 rW, // Return Output (to be stored). 3396 StpU4 con0, // Constants generated by StpTaaCon(). 3397 StpU4 con1, 3398 StpU4 con2, 3399 StpU4 con3) { 3400//------------------------------------------------------------------------------------------------------------------------------ 3401 // This is only currently used for debug. 3402 StpH1 dit = StpTaaDitH(o); 3403//------------------------------------------------------------------------------------------------------------------------------ 3404 // Rename constants. 3405 StpF2 kCRcpF = StpF2_U2(con0.xy); 3406 StpF2 kHalfCRcpFUnjitC = StpF2_U2(con0.zw); 3407 StpF2 kRcpC = StpF2_U2(con1.xy); 3408 StpF2 kRcpF = StpF2_U2(con1.zw); 3409 StpF2 kHalfRcpF = StpF2_U2(con2.xy); 3410 StpF2 kJitCRcpC0 = StpF2_U2(con2.zw); 3411 StpF2 kHalfRcpC = StpF2_U2(con3.xy); 3412 StpF2 kF = StpF2_U2(con3.zw); 3413//------------------------------------------------------------------------------------------------------------------------------ 3414 // Check the streaming bandwidth limit. 
3415 #if STP_BUG_BW_SOL 3416 { StpF2 oo = StpF2(o) * kRcpF; 3417 StpH4 g4 = StpTaaCtl4RH(oo); 3418 StpU4 m4 = StpTaaMot4H(oo); 3419 StpH1 cnv = StpTaaConH(oo); 3420 StpH4 f = StpTaaPriFedH(oo); 3421 StpH4 c4R = StpTaaCol4RH(oo); 3422 rW = rF = l4 + g4 + StpH4(m4) + StpH4_(cnv) + f + c4R; 3423 return; } 3424 #endif // STP_BUG_BW_SOL 3425//------------------------------------------------------------------------------------------------------------------------------ 3426 // Locate 2x2 neighborhood. 3427 // Float version of integer pixel offset in output. 3428 // All the 'o' prefixed variables are offset (aka position/coordinate) related. 3429 StpF2 oI = StpF2(o); 3430 // This gets to the center of the 2x2 quad directly because of possibility of shader/tex precision mismatch. 3431 // Precision mismatch could yield different 2x2 quads. 3432 StpF2 oC = oI * kCRcpF + kHalfCRcpFUnjitC; 3433 // NW of 2x2 quad. 3434 StpF2 oCNW = floor(oC + StpF2_(-0.5)); 3435 // Center of the 2x2 quad. 3436 StpF2 oC4 = oCNW * kRcpC + kRcpC; 3437 // Coordinates for low frequency convergence. 3438 StpF2 oC1 = oC * kRcpC; 3439//============================================================================================================================== 3440// FETCH {CONVERGENCE, COLOR, CONTROL, Z+MOTION} 3441//============================================================================================================================== 3442 // Fetch low-frequency convergence. 3443 StpH1 cnv = StpTaaConH(oC1); 3444 // Fetch color. 3445 StpH4 c4R = StpTaaCol4RH(oC4); 3446 StpH4 c4G = StpTaaCol4GH(oC4); 3447 StpH4 c4B = StpTaaCol4BH(oC4); 3448 StpH4 c4A = StpTaaCol4AH(oC4); 3449 // Control (GEAA weights) 3450 StpH4 g4 = StpTaaCtl4H(oC4); 3451 // Fetch {z,motion}. 
3452 StpU4 m4 = StpTaaMot4H(oC4); 3453//------------------------------------------------------------------------------------------------------------------------------ 3454// INDEPENDENT 3455//------------------------------------------------------------------------------------------------------------------------------ 3456 // Setup resolve position {0 to 1} inside 2x2 quad. 3457 // The extra -0.5 is to get from NW position to center. 3458 StpH2 rP = StpH2(oC - oCNW) - StpH2_(0.5); 3459//------------------------------------------------------------------------------------------------------------------------------ 3460 // The 'rP' is resolve position {0 to 1} inside 2x2 quad, this is distance to ends of 2x2. 3461 // Instead of using {a,a-1} this uses {a,1-a} for reuse with the simple angular filtering. 3462 StpH2 rPX10 = StpH2(1.0, 0.0) + StpH2(-rP.x, rP.x); 3463 StpH2 rPY01 = StpH2(0.0, 1.0) + StpH2(rP.y, -rP.y); 3464 // Distance^2 {0 := on, 1 := off}. 3465 StpH4 pen4x = StpH4(rPX10.g, rPX10.r, rPX10.r, rPX10.g); 3466 StpH4 pen4y = StpH4(rPY01.g, rPY01.g, rPY01.r, rPY01.r); 3467 // Pen starts with distance squared to all 2x2 points. 3468 StpH4 pen4 = StpSatH4(pen4x * pen4x + pen4y * pen4y); 3469//============================================================================================================================== 3470// DEPENDENT ON {CONVERGENCE} 3471//============================================================================================================================== 3472 // Low frequency convergence keeps the next frame value, so subtract one frame. 3473 cnv = StpSatH1(cnv - StpH1_(1.0 / STP_FRAME_MAX)); 3474//------------------------------------------------------------------------------------------------------------------------------ 3475 // Pen size based on convergence. 
3476 StpH1 pen = StpH1_(cnv) * StpH1_(STP_FRAME_MAX) + StpH1_(1.0); 3477 pen = StpPrxLoSqrtH1(pen); 3478 pen4 = StpSatH4(StpH4_(1.0) - pen4 * StpH4_(pen)); 3479 #if defined(STP_16BIT) 3480 StpH2 pen2 = pen4.xy * pen4.xy + pen4.zw * pen4.zw; 3481 pen = StpSatH1(pen2.x + pen2.y); 3482 #else // defined(STP_16BIT) 3483 pen = StpSatMF1(pen4.x * pen4.x + pen4.y * pen4.y + pen4.z * pen4.z + pen4.w * pen4.w); 3484 #endif // defined(STP_16BIT) 3485//============================================================================================================================== 3486// DEPENDENT ON {COLOR} 3487//============================================================================================================================== 3488 // Simple angular filtering (gets rid of block artifacts, adds sawtooth artifacts which are not a problem in practice). 3489 // Create a GEAA based weighting for no temporal feedback case. 3490 StpH4 wG; 3491 // Selects between either (S) or (T). 3492 // (S) A--B ... (T) A--B 3493 // |\ | | /| 3494 // | \| |/ | 3495 // R--G R--G 3496 // S and T only use the other diagonal. 3497 // Exact luma not required. 3498 StpH4 l4 = c4R + c4G * StpH4_(2.0) + c4B; 3499 StpH2 difST = abs(l4.gr - l4.ab); 3500 // Choose configuration based on which difference is maximum. 3501 StpP1 useS = difST.x > difST.y; 3502 // Choose interpolation weights given the configuration. 3503 // _T__________ _S__________ 3504 // R | sat( -x+ y) min(1-x, y) = y-G 3505 // G | min( x, y) sat(x-1+ y) = y-R 3506 // B | sat( x- y) min( x,1-y) = (1-y)-A 3507 // A | min(1-x,1-y) sat(1-x- y) = (1-y)-B 3508 // Difference between S and T is a {x} vs {1-x} and a RGBA vs GRAB swap. 
3509 StpH2 wTrb = StpSatH2(StpH2(-rP.x, rP.x) + StpH2(rP.y, -rP.y)); 3510 StpH2 wSrb = min(rPX10, rPY01); 3511 if(useS) wTrb = wSrb; 3512 StpH2 wTga = rPY01 - wTrb; 3513 wG.rg = StpH2(wTrb.x, wTga.x); 3514 wG.ba = StpH2(wTrb.y, wTga.y); 3515 // Shaping is needed to get good high area scaling (remove the transition region). 3516 wG *= wG; 3517 wG *= wG; 3518//------------------------------------------------------------------------------------------------------------------------------ 3519 // Scale directional interpolation weights by GEAA weights to introduce anti-aliasing. 3520 wG *= g4; 3521 // Triangular nearest. 3522 // This works by removing the corner which contributes the least to the spatial interpolated result. 3523 StpH4 triMask = StpH4_(1.0); 3524 StpH2 wGmin2 = min(wG.xy, wG.zw); 3525//============================================================================================================================== 3526// DEPENDENT ON {Z,MOTION} 3527//============================================================================================================================== 3528 // This overwrites gather4 results. 3529 if(wGmin2.x < wGmin2.y) { 3530 if(wG.x < wG.z) { triMask.x = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.x = 0xFFFFFFFF; } 3531 else { triMask.z = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.z = 0xFFFFFFFF; } } 3532 else { 3533 if(wG.y < wG.w) { triMask.y = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.y = 0xFFFFFFFF; } 3534 else { triMask.w = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.w = 0xFFFFFFFF; } } 3535 StpU1 m1 = min(StpMin3U1(m4.x, m4.y, m4.z), m4.w); 3536//------------------------------------------------------------------------------------------------------------------------------ 3537 // Want to consume 'triMask' to free up register space. 3538 wG *= triMask; 3539//------------------------------------------------------------------------------------------------------------------------------ 3540 StpF2 mXY; 3541 // Motion 'm' units are {1 := move by one screen}. 
3542 StpMvUnpackV(mXY, m1); 3543//============================================================================================================================== 3544// GET ALL FEEDBACK FILTERING DONE 3545//============================================================================================================================== 3546 // This region of code will have the highest register pressure in some configs, so doing as early as possible. 3547 // Setup for fetch feedback. 3548 StpF2 oF = oI * kRcpF + kHalfRcpF - mXY; 3549//------------------------------------------------------------------------------------------------------------------------------ 3550 StpH3 f; 3551 // Lanczos common. 3552 #if STP_TAA_PRX_LANCZOS 3553 // Motion reprojection position in feedback pixels. 3554 StpF2 oM = oI + StpF2_(0.5) - mXY * kF; 3555 // NW of center 2x2 quad. 3556 StpF2 oMNW = floor(oM + StpF2_(-0.5)); 3557 // Center of the center 2x2 quad. 3558 StpF2 oM4 = oMNW * kRcpF + kRcpF; 3559 StpH3 fMax, fMin; 3560 #else // STP_TAA_PRX_LANCZOS 3561 // Sample nearest feedback. 3562 f = StpTaaPriFedH(oF).rgb; 3563 #endif // STP_TAA_PRX_LANCZOS 3564//============================================================================================================================== 3565 #if (STP_TAA_PRX_LANCZOS == 1) 3566 // This one does a fixed 1x4 to try to cut cost in half relative to the complete 4x4. 3567 // It uses bilinear sampling on the 'x'. 3568 // Lanczos on the 'y' because most floating camera motion is 'y' based. 3569 // Fetch {feedback}. 3570 #if STP_OFFSETS 3571 // TODO: Can optimize out the 'oM4.y' add with constant change. 
3572 StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5)); 3573 StpH3 f0 = StpTaaPriFedH(oM0).rgb; 3574 StpH3 f1 = StpTaaPriFedOH(oM0, StpI2(0, 1)).rgb; 3575 StpH3 f2 = StpTaaPriFedOH(oM0, StpI2(0, 2)).rgb; 3576 StpH3 f3 = StpTaaPriFedOH(oM0, StpI2(0, 3)).rgb; 3577 #else // STP_OFFSETS 3578 StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5)); 3579 StpF2 oM1 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-0.5)); 3580 StpF2 oM2 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 0.5)); 3581 StpF2 oM3 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 1.5)); 3582 StpH3 f0 = StpTaaPriFedH(oM0).rgb; 3583 StpH3 f1 = StpTaaPriFedH(oM1).rgb; 3584 StpH3 f2 = StpTaaPriFedH(oM2).rgb; 3585 StpH3 f3 = StpTaaPriFedH(oM3).rgb; 3586 #endif // STP_OFFSETS 3587 // Want this last because it's used last. 3588 #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING) 3589 fMax = StpTaaPriFedMaxH(oM4).rgb; 3590 fMin = StpTaaPriFedMinH(oM4).rgb; 3591 #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING) 3592 #if ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING) 3593 // Without {min,max} sampling, must gather4. 3594 StpH4 f4R = StpTaaPriFed4RH(oM4); 3595 StpH4 f4G = StpTaaPriFed4GH(oM4); 3596 StpH4 f4B = StpTaaPriFed4BH(oM4); 3597 #endif // ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING) 3598//------------------------------------------------------------------------------------------------------------------------------ 3599// INDEPENDENT 3600//------------------------------------------------------------------------------------------------------------------------------ 3601 // Convert to approximate lanczos weights. 3602 // Feedback position {0 to 1} inside 2x2 quad + 0.5. 3603 StpH2 fP = StpH2(oM - oMNW); 3604 // Convert to approximate lanczos weights. 3605 // This converts {-2 to 2} to {-1 to 1} because the kernel approximation is written for {-1 to 1}. 3606 StpH4 fPY = StpH4_(-fP.y * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5); 3607 // Weights in one axis. 
3608 fPY = StpSatH4(StpH4_(1.0) - fPY * fPY); 3609 fPY *= fPY; 3610 StpH4 fPY4 = fPY * fPY; 3611 // ^6 (slightly more negative lobe than lanczos 2, slightly less expensive) 3612 fPY = (StpH4_(1.0 + 81.0 / 175.0) * fPY4 - StpH4_(81.0 / 175.0)) * fPY; 3613 #if defined(STP_16BIT) 3614 StpH2 fRcp2 = fPY.rg + fPY.ba; 3615 StpH1 fRcp = StpPrxLoRcpH1(fRcp2.x + fRcp2.y); 3616 #else // defined(STP_16BIT) 3617 StpMF1 fRcp = StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a); 3618 #endif // defined(STP_16BIT) 3619//------------------------------------------------------------------------------------------------------------------------------ 3620// DEPENDENT 3621//------------------------------------------------------------------------------------------------------------------------------ 3622 f.rgb = f0 * StpH3_(fPY.r) + f1 * StpH3_(fPY.g) + f2 * StpH3_(fPY.b) + f3 * StpH3_(fPY.a); 3623 f.rgb *= StpH3_(fRcp); 3624 #if STP_TAA_PRX_LANCZOS_DERING 3625 #if (STP_MAX_MIN_10BIT == 0) 3626 #if defined(STP_16BIT) 3627 StpH2 fXnyR = max(max(StpH2(f4R.x, -f4R.x), StpH2(f4R.y, -f4R.y)), 3628 max(StpH2(f4R.z, -f4R.z), StpH2(f4R.w, -f4R.w))); 3629 StpH2 fXnyG = max(max(StpH2(f4G.x, -f4G.x), StpH2(f4G.y, -f4G.y)), 3630 max(StpH2(f4G.z, -f4G.z), StpH2(f4G.w, -f4G.w))); 3631 StpH2 fXnyB = max(max(StpH2(f4B.x, -f4B.x), StpH2(f4B.y, -f4B.y)), 3632 max(StpH2(f4B.z, -f4B.z), StpH2(f4B.w, -f4B.w))); 3633 f = clamp(f, StpH3(-fXnyR.y, -fXnyG.y, -fXnyB.y), StpH3(fXnyR.x, fXnyG.x, fXnyB.x)); 3634 #else // defined(STP_16BIT) 3635 fMax.r = max(StpMax3H1(f4R.x, f4R.y, f4R.z), f4R.w); 3636 fMax.g = max(StpMax3H1(f4G.x, f4G.y, f4G.z), f4G.w); 3637 fMax.b = max(StpMax3H1(f4B.x, f4B.y, f4B.z), f4B.w); 3638 fMin.r = min(StpMin3H1(f4R.x, f4R.y, f4R.z), f4R.w); 3639 fMin.g = min(StpMin3H1(f4G.x, f4G.y, f4G.z), f4G.w); 3640 fMin.b = min(StpMin3H1(f4B.x, f4B.y, f4B.z), f4B.w); 3641 f = clamp(f, fMin, fMax); 3642 #endif // defined(STP_16BIT) 3643 #else // (STP_MAX_MIN_10BIT == 0) 3644 // Leaning on {min,max} sampling 
so no 16/32-bit permutation. 3645 f = clamp(f, fMin, fMax); 3646 #endif // (STP_MAX_MIN_10BIT == 0) 3647 #endif // STP_TAA_PRX_LANCZOS_DERING 3648 #endif // (STP_TAA_PRX_LANCZOS == 1) 3649//============================================================================================================================== 3650 #if (STP_TAA_PRX_LANCZOS == 2) 3651 // Unstable approximate lanczos feedback, full 4x4. 3652 // a = saturate(1-x*x) 3653 // u = 1+v 3654 // v = moves the zero crossing to 0.5 3655 // w = adjusts the shape 3656 // u*a^w - v*a^2 3657 // Fetch {feedback}. 3658 // 0w 0z 1w 1z | R 3659 // 0x 0y 1x 1y | G 3660 // 2w 2z 3w 3z | B 3661 // 2x 2y 3x 3y | A 3662 // -- -- -- -- 3663 // R G B A 3664 #if STP_OFFSETS 3665 StpH4 f4R0 = StpTaaPriFed4ROH(oM4, StpI2(-1, -1)); 3666 StpH4 f4G0 = StpTaaPriFed4GOH(oM4, StpI2(-1, -1)); 3667 StpH4 f4B0 = StpTaaPriFed4BOH(oM4, StpI2(-1, -1)); 3668 StpH4 f4R1 = StpTaaPriFed4ROH(oM4, StpI2( 1, -1)); 3669 StpH4 f4G1 = StpTaaPriFed4GOH(oM4, StpI2( 1, -1)); 3670 StpH4 f4B1 = StpTaaPriFed4BOH(oM4, StpI2( 1, -1)); 3671 StpH4 f4R2 = StpTaaPriFed4ROH(oM4, StpI2(-1, 1)); 3672 StpH4 f4G2 = StpTaaPriFed4GOH(oM4, StpI2(-1, 1)); 3673 StpH4 f4B2 = StpTaaPriFed4BOH(oM4, StpI2(-1, 1)); 3674 StpH4 f4R3 = StpTaaPriFed4ROH(oM4, StpI2( 1, 1)); 3675 StpH4 f4G3 = StpTaaPriFed4GOH(oM4, StpI2( 1, 1)); 3676 StpH4 f4B3 = StpTaaPriFed4BOH(oM4, StpI2( 1, 1)); 3677 #else // STP_OFFSETS 3678 StpF2 oM0 = oM4 + StpF2(-kRcpF.x, -kRcpF.y); 3679 StpF2 oM1 = oM4 + StpF2( kRcpF.x, -kRcpF.y); 3680 StpF2 oM2 = oM4 + StpF2(-kRcpF.x, kRcpF.y); 3681 StpF2 oM3 = oM4 + StpF2( kRcpF.x, kRcpF.y); 3682 StpH4 f4R0 = StpTaaPriFed4RH(oM0); 3683 StpH4 f4G0 = StpTaaPriFed4GH(oM0); 3684 StpH4 f4B0 = StpTaaPriFed4BH(oM0); 3685 StpH4 f4R1 = StpTaaPriFed4RH(oM1); 3686 StpH4 f4G1 = StpTaaPriFed4GH(oM1); 3687 StpH4 f4B1 = StpTaaPriFed4BH(oM1); 3688 StpH4 f4R2 = StpTaaPriFed4RH(oM2); 3689 StpH4 f4G2 = StpTaaPriFed4GH(oM2); 3690 StpH4 f4B2 = StpTaaPriFed4BH(oM2); 3691 StpH4 f4R3 = 
StpTaaPriFed4RH(oM3); 3692 StpH4 f4G3 = StpTaaPriFed4GH(oM3); 3693 StpH4 f4B3 = StpTaaPriFed4BH(oM3); 3694 #endif // STP_OFFSETS 3695 // Want this last because it's used last. 3696 #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING) 3697 fMax = StpTaaPriFedMaxH(oM4).rgb; 3698 fMin = StpTaaPriFedMinH(oM4).rgb; 3699 #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING) 3700//------------------------------------------------------------------------------------------------------------------------------ 3701// INDEPENDENT 3702//------------------------------------------------------------------------------------------------------------------------------ 3703 // Feedback position {0 to 1} inside 2x2 quad + 0.5. 3704 StpH2 fP = StpH2(oM - oMNW); 3705 // Convert to approximate lanczos weights. 3706 // This converts {-2 to 2} to {-1 to 1} because the kernel approximation is written for {-1 to 1}. 3707 StpH4 fPX = StpH4_(-fP.x * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5); 3708 StpH4 fPY = StpH4_(-fP.y * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5); 3709 // Weights in both axis. 
3710 fPX = StpSatH4(StpH4_(1.0) - fPX * fPX); 3711 fPY = StpSatH4(StpH4_(1.0) - fPY * fPY); 3712 fPX *= fPX; 3713 fPY *= fPY; 3714 StpH4 fPX4 = fPX * fPX; 3715 StpH4 fPY4 = fPY * fPY; 3716 // ^6 (slightly more negative lobe than lanczos 2, slightly less expensive) 3717 fPX = (StpH4_(1.0 + 81.0 / 175.0) * fPX4 - StpH4_(81.0 / 175.0)) * fPX; 3718 fPY = (StpH4_(1.0 + 81.0 / 175.0) * fPY4 - StpH4_(81.0 / 175.0)) * fPY; 3719 #if defined(STP_16BIT) 3720 StpH2 fRcpX = fPX.rg + fPX.ba; 3721 StpH2 fRcpY = fPY.rg + fPY.ba; 3722 fPX *= StpH4_(StpPrxLoRcpH1(fRcpX.r + fRcpX.y)); 3723 fPY *= StpH4_(StpPrxLoRcpH1(fRcpY.r + fRcpY.y)); 3724 #else // defined(STP_16BIT) 3725 fPX *= StpMF4_(StpPrxLoRcpMF1(fPX.r + fPX.g + fPX.b + fPX.a)); 3726 fPY *= StpMF4_(StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a)); 3727 #endif // defined(STP_16BIT) 3728 StpH4 fPX0 = fPX * StpH4_(fPY.r); 3729 StpH4 fPX1 = fPX * StpH4_(fPY.g); 3730 StpH4 fPX2 = fPX * StpH4_(fPY.b); 3731 StpH4 fPX3 = fPX * StpH4_(fPY.a); 3732//------------------------------------------------------------------------------------------------------------------------------ 3733// DEPENDENT 3734//------------------------------------------------------------------------------------------------------------------------------ 3735 #if defined(STP_16BIT) 3736 StpH2 fR2 = f4R0.wz * fPX0.xy + f4R1.wz * fPX0.zw + f4R0.xy * fPX1.xy + f4R1.xy * fPX1.zw + 3737 f4R2.wz * fPX2.xy + f4R3.wz * fPX2.zw + f4R2.xy * fPX3.xy + f4R3.xy * fPX3.zw; 3738 StpH2 fG2 = f4G0.wz * fPX0.xy + f4G1.wz * fPX0.zw + f4G0.xy * fPX1.xy + f4G1.xy * fPX1.zw + 3739 f4G2.wz * fPX2.xy + f4G3.wz * fPX2.zw + f4G2.xy * fPX3.xy + f4G3.xy * fPX3.zw; 3740 StpH2 fB2 = f4B0.wz * fPX0.xy + f4B1.wz * fPX0.zw + f4B0.xy * fPX1.xy + f4B1.xy * fPX1.zw + 3741 f4B2.wz * fPX2.xy + f4B3.wz * fPX2.zw + f4B2.xy * fPX3.xy + f4B3.xy * fPX3.zw; 3742 f = StpH3(fR2.x + fR2.y, fG2.x + fG2.y, fB2.x + fB2.y); 3743 #else // defined(STP_16BIT) 3744 f.r = f4R0.w * fPX0.r + f4R0.z * fPX0.g + f4R1.w * fPX0.b + 
f4R1.z * fPX0.a + 3745 f4R0.x * fPX1.r + f4R0.y * fPX1.g + f4R1.x * fPX1.b + f4R1.y * fPX1.a + 3746 f4R2.w * fPX2.r + f4R2.z * fPX2.g + f4R3.w * fPX2.b + f4R3.z * fPX2.a + 3747 f4R2.x * fPX3.r + f4R2.y * fPX3.g + f4R3.x * fPX3.b + f4R3.y * fPX3.a; 3748 f.g = f4G0.w * fPX0.r + f4G0.z * fPX0.g + f4G1.w * fPX0.b + f4G1.z * fPX0.a + 3749 f4G0.x * fPX1.r + f4G0.y * fPX1.g + f4G1.x * fPX1.b + f4G1.y * fPX1.a + 3750 f4G2.w * fPX2.r + f4G2.z * fPX2.g + f4G3.w * fPX2.b + f4G3.z * fPX2.a + 3751 f4G2.x * fPX3.r + f4G2.y * fPX3.g + f4G3.x * fPX3.b + f4G3.y * fPX3.a; 3752 f.b = f4B0.w * fPX0.r + f4B0.z * fPX0.g + f4B1.w * fPX0.b + f4B1.z * fPX0.a + 3753 f4B0.x * fPX1.r + f4B0.y * fPX1.g + f4B1.x * fPX1.b + f4B1.y * fPX1.a + 3754 f4B2.w * fPX2.r + f4B2.z * fPX2.g + f4B3.w * fPX2.b + f4B3.z * fPX2.a + 3755 f4B2.x * fPX3.r + f4B2.y * fPX3.g + f4B3.x * fPX3.b + f4B3.y * fPX3.a; 3756 #endif // defined(STP_16BIT) 3757 #if STP_TAA_PRX_LANCZOS_DERING 3758 #if (STP_MAX_MIN_10BIT == 0) 3759 #if defined(STP_16BIT) 3760 StpH2 fXnyR = max(max(StpH2(f4R0.y, -f4R0.y), StpH2(f4R1.x, -f4R1.x)), 3761 max(StpH2(f4R2.z, -f4R2.z), StpH2(f4R3.w, -f4R3.w))); 3762 StpH2 fXnyG = max(max(StpH2(f4G0.y, -f4G0.y), StpH2(f4G1.x, -f4G1.x)), 3763 max(StpH2(f4G2.z, -f4G2.z), StpH2(f4G3.w, -f4G3.w))); 3764 StpH2 fXnyB = max(max(StpH2(f4B0.y, -f4B0.y), StpH2(f4B1.x, -f4B1.x)), 3765 max(StpH2(f4B2.z, -f4B2.z), StpH2(f4B3.w, -f4B3.w))); 3766 f = clamp(f, StpH3(-fXnyR.y, -fXnyG.y, -fXnyB.y), StpH3(fXnyR.x, fXnyG.x, fXnyB.x)); 3767 #else // defined(STP_16BIT) 3768 fMax.r = max(StpMax3H1(f4R0.y, f4R1.x, f4R2.z), f4R3.w); 3769 fMax.g = max(StpMax3H1(f4G0.y, f4G1.x, f4G2.z), f4G3.w); 3770 fMax.b = max(StpMax3H1(f4B0.y, f4B1.x, f4B2.z), f4B3.w); 3771 fMin.r = min(StpMin3H1(f4R0.y, f4R1.x, f4R2.z), f4R3.w); 3772 fMin.g = min(StpMin3H1(f4G0.y, f4G1.x, f4G2.z), f4G3.w); 3773 fMin.b = min(StpMin3H1(f4B0.y, f4B1.x, f4B2.z), f4B3.w); 3774 f = clamp(f, fMin, fMax); 3775 #endif // defined(STP_16BIT) 3776 #else // 
(STP_MAX_MIN_10BIT == 0) 3777 // Leaning on {min,max} sampling so no 16/32-bit permutation. 3778 f = clamp(f, fMin, fMax); 3779 #endif // (STP_MAX_MIN_10BIT == 0) 3780 #endif // STP_TAA_PRX_LANCZOS_DERING 3781 #endif // (STP_TAA_PRX_LANCZOS == 2) 3782//============================================================================================================================== 3783// DISPLACEMENT 3784//============================================================================================================================== 3785 // Note the 'kJitCRcpC0' gets to position 0 to save some runtime maths. 3786 // 3 2 3787 // 0 1 3788 StpF2 oD0 = oC4 + kJitCRcpC0 - mXY; 3789 StpF2 oD1 = StpF2(kRcpC.x, 0.0) + oD0; 3790 StpF2 oD2 = StpF2(kRcpC.x, -kRcpC.y) + oD0; 3791 StpF2 oD3 = StpF2(0.0, -kRcpC.y) + oD0; 3792 StpH3 d0 = StpTaaPriFedH(oD0).rgb; 3793 StpH3 d1 = StpTaaPriFedH(oD1).rgb; 3794 StpH3 d2 = StpTaaPriFedH(oD2).rgb; 3795 StpH3 d3 = StpTaaPriFedH(oD3).rgb; 3796//------------------------------------------------------------------------------------------------------------------------------ 3797// INDEPENDENT 3798//------------------------------------------------------------------------------------------------------------------------------ 3799 // Normalize interpolation weights. 3800 #if defined(STP_16BIT) 3801 StpH2 wG2 = wG.xy + wG.zw; 3802 wG = StpSatH4(wG * StpH4_(StpPrxLoRcpH1(wG2.x + wG2.y))); 3803 #else // defined(STP_16BIT) 3804 wG = StpSatMF4(wG * StpMF4_(StpPrxLoRcpMF1(wG.x + wG.y + wG.z + wG.w))); 3805 #endif // defined(STP_16BIT) 3806//------------------------------------------------------------------------------------------------------------------------------ 3807 // Temporal weighting. 
3808 StpH4 wT = abs(c4R - StpH4_(f.r)) * StpH4_(STP_LUMA_R) + 3809 abs(c4G - StpH4_(f.g)) * StpH4_(STP_LUMA_G) + 3810 abs(c4B - StpH4_(f.b)) * StpH4_(STP_LUMA_B); 3811 wT = StpPrxLoRcpH4(wT * StpH4_(STP_ANTI_MAX) + StpH4_(STP_ANTI_MIN)) * triMask; 3812//------------------------------------------------------------------------------------------------------------------------------ 3813 #if defined(STP_16BIT) 3814 StpH2 wT2 = wT.xy + wT.zw; 3815 wT = StpSatH4(wT * StpH4_(StpPrxLoRcpH1(wT2.x + wT2.y))); 3816 #else // defined(STP_16BIT) 3817 wT = StpSatMF4(wT * StpMF4_(StpPrxLoRcpMF1(wT.x + wT.y + wT.z + wT.w))); 3818 #endif // defined(STP_16BIT) 3819//------------------------------------------------------------------------------------------------------------------------------ 3820 // Interpolate match. 3821 // Using a fixed 50/50 split of two normalized weights yields a normalized weight. 3822 StpH4 wM = wT * StpH4_(0.5) + wG * StpH4_(0.5); 3823 #if defined(STP_16BIT) 3824 StpH2 match2 = (c4A.xy * wM.xy) + (c4A.zw * wM.zw); 3825 StpH1 match = match2.x + match2.y; 3826 #else // defined(STP_16BIT) 3827 StpMF1 match = c4A.x * wM.x + c4A.y * wM.y + c4A.z * wM.z + c4A.w * wM.w; 3828 #endif // defined(STP_16BIT) 3829 // Non-motion-match kills convergence for this frame only. 3830 cnv *= match; 3831//------------------------------------------------------------------------------------------------------------------------------ 3832// DEPENDENT 3833//------------------------------------------------------------------------------------------------------------------------------ 3834 // Interpolation, this first section doesn't have gather4, so probably no gain in swizzling. 
3835 StpH3 dG = d0 * StpH3_(wG.x) + d1 * StpH3_(wG.y) + d2 * StpH3_(wG.z) + d3 * StpH3_(wG.w); 3836 StpH3 dT = d0 * StpH3_(wT.x) + d1 * StpH3_(wT.y) + d2 * StpH3_(wT.z) + d3 * StpH3_(wT.w); 3837//------------------------------------------------------------------------------------------------------------------------------ 3838 #if defined(STP_16BIT) 3839 StpH2 t2R = (c4R.xy * wT.xy) + (c4R.zw * wT.zw); 3840 StpH2 t2G = (c4G.xy * wT.xy) + (c4G.zw * wT.zw); 3841 StpH2 t2B = (c4B.xy * wT.xy) + (c4B.zw * wT.zw); 3842 StpH3 t = StpH3(t2R.x + t2R.y, t2G.x + t2G.y, t2B.x + t2B.y); 3843 StpH2 c2R = (c4R.xy * wG.xy) + (c4R.zw * wG.zw); 3844 StpH2 c2G = (c4G.xy * wG.xy) + (c4G.zw * wG.zw); 3845 StpH2 c2B = (c4B.xy * wG.xy) + (c4B.zw * wG.zw); 3846 StpH3 c = StpH3(c2R.x + c2R.y, c2G.x + c2G.y, c2B.x + c2B.y); 3847 #else // defined(STP_16BIT) 3848 StpMF3 t = StpMF3( 3849 c4R.x * wT.x + c4R.y * wT.y + c4R.z * wT.z + c4R.w * wT.w, 3850 c4G.x * wT.x + c4G.y * wT.y + c4G.z * wT.z + c4G.w * wT.w, 3851 c4B.x * wT.x + c4B.y * wT.y + c4B.z * wT.z + c4B.w * wT.w); 3852 StpMF3 c = StpMF3( 3853 c4R.x * wG.x + c4R.y * wG.y + c4R.z * wG.z + c4R.w * wG.w, 3854 c4G.x * wG.x + c4G.y * wG.y + c4G.z * wG.z + c4G.w * wG.w, 3855 c4B.x * wG.x + c4B.y * wG.y + c4B.z * wG.z + c4B.w * wG.w); 3856 #endif // defined(STP_16BIT) 3857//------------------------------------------------------------------------------------------------------------------------------ 3858 // Neighborhood. 3859 StpH1 bln = StpSatH1(cnv * StpPrxLoRcpH1(cnv + StpH1_(1.0 / STP_FRAME_MAX))); 3860 StpH1 blnT = StpH1_(1.0) - bln; 3861 StpH3 b = f * StpH3_(bln) + t * StpH3_(blnT); 3862 StpH3 minNe = min(c, b); 3863 StpH3 maxNe = max(c, b); 3864//------------------------------------------------------------------------------------------------------------------------------ 3865 // Apply pen. 
3866 StpH3 penC = StpSatH3(c + (f - dG) * StpH3_(StpH1_(0.9875) * match)); 3867 StpH2 penWF; 3868 penWF.x = pen * StpH1_(STP_TAA_PEN_W); 3869 penWF.y = pen * lerp(StpH1_(STP_TAA_PEN_F0), StpH1_(STP_TAA_PEN_F1), cnv); 3870 StpH2 penNotWF = StpH2_(1.0) - penWF; 3871 rF.rgb = t + (f - dT); 3872 rF.rgb = rF.rgb * StpH3_(blnT) + f * StpH3_(bln); 3873 rW.rgb = StpSatH3(rF.rgb * StpH3_(penNotWF.x) + penC * StpH3_(penWF.x)); 3874 rF.rgb = StpSatH3(rF.rgb * StpH3_(penNotWF.y) + penC * StpH3_(penWF.y)); 3875 rW.rgb = clamp(rW.rgb, minNe, maxNe); 3876 rF.rgb = clamp(rF.rgb, minNe, maxNe); 3877//------------------------------------------------------------------------------------------------------------------------------ 3878 // Get back into linear, and then HDR. 3879 rW.rgb *= rW.rgb; 3880 #if (STP_POSTMAP == 0) 3881 StpToneInvH3(rW.rgb); 3882 #endif // (STP_POSTMAP == 0) 3883 // Alpha is currently unused, this might improve compression (vs undefined). 3884 rF.a = rW.a = StpH1(0.0); } 3885#endif // defined(STP_GPU) && defined(STP_TAA) && defined(STP_16BIT) 3886//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 3887//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 3888//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 3889//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 3890//_____________________________________________________________.._______________________________________________________________ 3891//============================================================================================================================== 3892// 3893// GOOD ENOUGH ANTI-ALIASING [GEAA] 3894// 
//------------------------------------------------------------------------------------------------------------------------------
// Yet another simplified spatial morphological AA.
// Not perfect, but it has low complexity (one pass), and is good enough for a TAA override.
// Fails on longer edges (due to low maximum search), doesn't get diagonals perfect.
// But good on already part AA'ed inputs.
// The spatial AA is not used in STP, only a weighting value which is later used to guide a quick-and-dirty scalar.
// With some modification this could be used for spatial AA, with or without scaling.
//------------------------------------------------------------------------------------------------------------------------------
// CALLBACKS
// =========
// StpMF4 StpGeaa4F(StpF2 p) - Gather4 of luma (or green as luma).
// StpMF4 StpGeaa4OF(StpF2 p, StpI2 o) - Offset variant, called by StpGeaaF() when STP_OFFSETS is set.
// ---------
// StpH4 StpGeaa4H(StpF2 p)
//==============================================================================================================================
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                     [GEAA] DEFAULTS
//==============================================================================================================================
// Choose a configuration of number of positions to sample.
//  0 ... 3 per side (faster, less quality)
//  1 ... 5 per side
//  2 ... 7 per side
//  3 ... 9 per side (slower, higher quality)
// Can be overridden by defining STP_GEAA_P before this point.
#ifndef STP_GEAA_P
    #define STP_GEAA_P 3
#endif // STP_GEAA_P
//------------------------------------------------------------------------------------------------------------------------------
// Amount of sub-pixel blur.
//  0.50 ... Turn it off
//  0.25 ... Middle ground
//  0.00 ... More blur
// Can be overridden by defining STP_GEAA_SUBPIX before this point.
#ifndef STP_GEAA_SUBPIX
    #define STP_GEAA_SUBPIX (8.0 / 16.0)
#endif // STP_GEAA_SUBPIX
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                  [GEAA] INTERNAL TUNING
//==============================================================================================================================
// Higher numbers can reduce the amount of AA, lower numbers can increase it but can look dirty.
// Best not to mess with this, 1/3 is the 'correct' value for 2 of the 3 edge cases.
#define STP_GEAA_THRESHOLD (1.0/3.0)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                                  [GEAA] 32-BIT ENTRY POINT
//==============================================================================================================================
// See the 16-bit version for all comments, this is the same logic using the 32-bit 'StpMF' types.
// Outputs and inputs match the 16-bit entry point,
//  gW ........ Output weight for pixel art scalar.
//  gLuma ..... Filtered luma for debug.
//  gFilter ... Location to sample for standalone unscaled spatial AA.
//  gDilate ... Location of highest contrast neighbor.
//  p ......... {0 to 1} position across screen.
//  kRcpI ..... 1.0 / input image size in pixels.
//  kHalfRcpI . 0.5 / input image size in pixels.
#if defined(STP_GPU) && defined(STP_GEAA) && defined(STP_32BIT)
    void StpGeaaF(
    out StpMF1 gW, out StpMF1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI) {
//------------------------------------------------------------------------------------------------------------------------------
        // Fetch the 3x3 neighborhood {A B C, D E F, G H I} using four gather4s.
        #if STP_OFFSETS
            StpF2 pDEBA = p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y);
            StpMF4 gDEBA = StpGeaa4F(pDEBA);
            StpMF4 gEFCB = StpGeaa4OF(pDEBA, StpI2(1, 0));
            StpMF4 gGHED = StpGeaa4OF(pDEBA, StpI2(0, 1));
            StpMF4 gHIFE = StpGeaa4OF(pDEBA, StpI2(1, 1));
        #else // STP_OFFSETS
            StpMF4 gDEBA = StpGeaa4F(p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y)); // .xyzw=DEBA
            StpMF4 gEFCB = StpGeaa4F(p + StpF2( kHalfRcpI.x, -kHalfRcpI.y)); // .yz  =FC
            StpMF4 gGHED = StpGeaa4F(p + StpF2(-kHalfRcpI.x,  kHalfRcpI.y)); // .xy  =GH
            StpMF4 gHIFE = StpGeaa4F(p + StpF2( kHalfRcpI.x,  kHalfRcpI.y)); // .y   =I
        #endif // STP_OFFSETS
//------------------------------------------------------------------------------------------------------------------------------
        // {horz,vert} change terms for rows/columns {ABC,ADG}, {DEF,BEH}, {GHI,CFI}.
        StpMF2 gHV0,gHV1,gHV2;
        gHV0.x = gDEBA.z * StpMF1_(-2.0) + gEFCB.z;
        gHV0.y = gDEBA.x * StpMF1_(-2.0) + gGHED.x;
        gHV0 += StpMF2_(gDEBA.w);
        gHV1.x = gDEBA.x + gEFCB.y;
        gHV1.y = gDEBA.z + gGHED.y;
        gHV1 += StpMF2_(gDEBA.y) * StpMF2_(-2.0);
        gHV2.x = gGHED.x + gGHED.y * StpMF1_(-2.0);
        gHV2.y = gEFCB.z + gEFCB.y * StpMF1_(-2.0);
        gHV2 += StpMF2_(gHIFE.y);
        #if 0
            StpMF2 gHV = abs(gHV0) + abs(gHV1) * StpMF2_(2.0) + abs(gHV2);
        #else
            StpMF2 gHV = gHV0 * gHV0 + gHV1 * gHV1 * StpMF2_(2.0) + gHV2 * gHV2;
        #endif
        // Search vertically when the horizontal change is larger (search perpendicular).
        StpP1 gVert = gHV.x > gHV.y;
//------------------------------------------------------------------------------------------------------------------------------
        // Neighborhood pairs, transposed when searching vertically.
        StpMF2 gBH = gVert ? StpMF2(gDEBA.x, gEFCB.y) : StpMF2(gDEBA.z, gGHED.y);
        StpMF2 gAC = gVert ? StpMF2(gDEBA.w, gGHED.x) : StpMF2(gDEBA.w, gEFCB.z);
        StpMF2 gDF = gVert ? StpMF2(gDEBA.z, gGHED.y) : StpMF2(gDEBA.x, gEFCB.y);
        // Transposed {G,I} is {C,I}. The C texel is 'gEFCB.z' ('gEFCB.y' is F, which would pair the wrong texel with B).
        StpMF2 gGI = gVert ? StpMF2(gEFCB.z, gHIFE.y) : StpMF2(gGHED.x, gHIFE.y);
        StpMF2 gBHMinusE = gBH - StpMF2_(gDEBA.y);
        StpMF2 gEnd2 = abs(gBHMinusE);
        StpP1 gUp = gEnd2.x >= gEnd2.y;
//------------------------------------------------------------------------------------------------------------------------------
        StpMF1 gE = gDEBA.y;
        // After this swap '.x' is the high-contrast neighbor.
        gBH = gUp ? gBH : gBH.yx;
//------------------------------------------------------------------------------------------------------------------------------
        StpMF2 gBi = gUp ? StpMF2(2.0 / 3.0, 1.0 / 3.0) : StpMF2(1.0 / 3.0 , 2.0 / 3.0);
        StpMF1 gBMinusE = gUp ? gBHMinusE.x : gBHMinusE.y;
        StpMF2 gBi0 = (gUp ? gAC : gGI) * StpMF2_(1.0 / 3.0) + gDF * StpMF2_(2.0 / 3.0);
        StpMF2 gLo0 = gDF;
        StpMF1 gAbsBMinusE = abs(gBMinusE);
//------------------------------------------------------------------------------------------------------------------------------
        StpF2 gWalk = gVert ? StpF2(0.0, kRcpI.y) : StpF2(kRcpI.x, 0.0);
        StpF2 gDecon = gVert ? StpF2(kRcpI.x, 0.0) : StpF2(0.0, kRcpI.y);
        if(gUp) gDecon = -gDecon;
//------------------------------------------------------------------------------------------------------------------------------
        StpF2 gP = p + gDecon * StpF2_(1.0/3.0);
//------------------------------------------------------------------------------------------------------------------------------
        StpF2 gPN3 = gP - StpF2_(8.5) * gWalk;
        StpF2 gPN2 = gP - StpF2_(6.5) * gWalk;
        StpF2 gPN1 = gP - StpF2_(4.5) * gWalk;
        StpF2 gPN0 = gP - StpF2_(2.5) * gWalk;
        StpF2 gPP0 = gP + StpF2_(2.5) * gWalk;
        StpF2 gPP1 = gP + StpF2_(4.5) * gWalk;
        StpF2 gPP2 = gP + StpF2_(6.5) * gWalk;
        StpF2 gPP3 = gP + StpF2_(8.5) * gWalk;
//------------------------------------------------------------------------------------------------------------------------------
        StpMF4 gGN3, gGN2, gGN1, gGN0, gGP0, gGP1, gGP2, gGP3;
        gGN3 = StpGeaa4F(gPN3);
        gGN2 = StpGeaa4F(gPN2);
        gGN1 = StpGeaa4F(gPN1);
        gGN0 = StpGeaa4F(gPN0);
        gGP0 = StpGeaa4F(gPP0);
        gGP1 = StpGeaa4F(gPP1);
        gGP2 = StpGeaa4F(gPP2);
        gGP3 = StpGeaa4F(gPP3);
//------------------------------------------------------------------------------------------------------------------------------
        // Transpose the gathers for the vertical search.
        if(gVert) {
            gGN3 = gGN3.zyxw;
            gGN2 = gGN2.zyxw;
            gGN1 = gGN1.zyxw;
            gGN0 = gGN0.zyxw;
            gGP0 = gGP0.zyxw;
            gGP1 = gGP1.zyxw;
            gGP2 = gGP2.zyxw;
            gGP3 = gGP3.zyxw; }
//------------------------------------------------------------------------------------------------------------------------------
        // Texels for the low-pass blur, {negative, positive} sides.
        StpMF2 gLo8 = StpMF2(gGN3.x, gGP3.y);
        StpMF2 gLo7 = StpMF2(gGN3.y, gGP3.x);
        StpMF2 gLo6 = StpMF2(gGN2.x, gGP2.y);
        StpMF2 gLo5 = StpMF2(gGN2.y, gGP2.x);
        StpMF2 gLo4 = StpMF2(gGN1.x, gGP1.y);
        StpMF2 gLo3 = StpMF2(gGN1.y, gGP1.x);
        StpMF2 gLo2 = StpMF2(gGN0.x, gGP0.y);
        StpMF2 gLo1 = StpMF2(gGN0.y, gGP0.x);
        if(!gUp) {
            gLo8 = StpMF2(gGN3.w, gGP3.z);
            gLo7 = StpMF2(gGN3.z, gGP3.w);
            gLo6 = StpMF2(gGN2.w, gGP2.z);
            gLo5 = StpMF2(gGN2.z, gGP2.w);
            gLo4 = StpMF2(gGN1.w, gGP1.z);
            gLo3 = StpMF2(gGN1.z, gGP1.w);
            gLo2 = StpMF2(gGN0.w, gGP0.z);
            gLo1 = StpMF2(gGN0.z, gGP0.w); }
//------------------------------------------------------------------------------------------------------------------------------
        // Simulate the bilinear fetch using the gathered texels.
        StpMF2 gGN3Bi = gGN3.yx * StpMF2_(gBi.x) + gGN3.zw * StpMF2_(gBi.y);
        StpMF2 gGN2Bi = gGN2.yx * StpMF2_(gBi.x) + gGN2.zw * StpMF2_(gBi.y);
        StpMF2 gGN1Bi = gGN1.yx * StpMF2_(gBi.x) + gGN1.zw * StpMF2_(gBi.y);
        StpMF2 gGN0Bi = gGN0.yx * StpMF2_(gBi.x) + gGN0.zw * StpMF2_(gBi.y);
        StpMF2 gGP0Bi = gGP0.yx * StpMF2_(gBi.x) + gGP0.zw * StpMF2_(gBi.y);
        StpMF2 gGP1Bi = gGP1.yx * StpMF2_(gBi.x) + gGP1.zw * StpMF2_(gBi.y);
        StpMF2 gGP2Bi = gGP2.yx * StpMF2_(gBi.x) + gGP2.zw * StpMF2_(gBi.y);
        StpMF2 gGP3Bi = gGP3.yx * StpMF2_(gBi.x) + gGP3.zw * StpMF2_(gBi.y);
        StpMF2 gBi8 = StpMF2(gGN3Bi.y, gGP3Bi.x);
        StpMF2 gBi7 = StpMF2(gGN3Bi.x, gGP3Bi.y);
        StpMF2 gBi6 = StpMF2(gGN2Bi.y, gGP2Bi.x);
        StpMF2 gBi5 = StpMF2(gGN2Bi.x, gGP2Bi.y);
        StpMF2 gBi4 = StpMF2(gGN1Bi.y, gGP1Bi.x);
        StpMF2 gBi3 = StpMF2(gGN1Bi.x, gGP1Bi.y);
        StpMF2 gBi2 = StpMF2(gGN0Bi.y, gGP0Bi.x);
        StpMF2 gBi1 = StpMF2(gGN0Bi.x, gGP0Bi.y);
//------------------------------------------------------------------------------------------------------------------------------
        // Threshold for end of span (X), and base to compare against (Y).
        StpMF2 gEndBase;
        gEndBase.y = gBMinusE * StpMF1_(1.0/3.0) + gE;
        gEndBase.x = gAbsBMinusE * StpMF1_(STP_GEAA_THRESHOLD);
        #if 0
            gEndBase.x = StpRcpMF1(max(StpMF1_(1.0 / 16384.0), gEndBase.x));
        #else
            gEndBase.x = StpPrxLoRcpMF1(gEndBase.x);
        #endif
//------------------------------------------------------------------------------------------------------------------------------
        // Opacity terms, {0 := not done, 1 := end of span}.
        #if (STP_GEAA_P > 2)
            StpMF2 gUseP8 = StpSatMF2(abs(gBi8 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
            StpMF2 gUseP7 = StpSatMF2(abs(gBi7 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
        #endif
        #if (STP_GEAA_P > 1)
            StpMF2 gUseP6 = StpSatMF2(abs(gBi6 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
            StpMF2 gUseP5 = StpSatMF2(abs(gBi5 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
        #endif
        #if (STP_GEAA_P > 0)
            StpMF2 gUseP4 = StpSatMF2(abs(gBi4 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
            StpMF2 gUseP3 = StpSatMF2(abs(gBi3 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
        #endif
        StpMF2 gUseP2 = StpSatMF2(abs(gBi2 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
        StpMF2 gUseP1 = StpSatMF2(abs(gBi1 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
        StpMF2 gUseP0 = StpSatMF2(abs(gBi0 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
//------------------------------------------------------------------------------------------------------------------------------
        // Distance traveled for {negative, positive} paths, starting from the maximum for the chosen configuration.
        #if (STP_GEAA_P == 3)
            StpMF2 gDst2 = StpMF2_(9.5);
        #elif (STP_GEAA_P == 2)
            StpMF2 gDst2 = StpMF2_(7.5);
        #elif (STP_GEAA_P == 1)
            StpMF2 gDst2 = StpMF2_(5.5);
        #elif (STP_GEAA_P == 0)
            StpMF2 gDst2 = StpMF2_(3.5);
        #else
            // Without this, an unsupported value only shows up as an undeclared 'gDst2'.
            #error STP_GEAA_P must be 0, 1, 2, or 3
        #endif
        #if (STP_GEAA_P > 2)
            gDst2 = gDst2 + (StpMF2_(8.5) - gDst2) * gUseP8;
            gDst2 = gDst2 + (StpMF2_(7.5) - gDst2) * gUseP7;
        #endif
        #if (STP_GEAA_P > 1)
            gDst2 = gDst2 + (StpMF2_(6.5) - gDst2) * gUseP6;
            gDst2 = gDst2 + (StpMF2_(5.5) - gDst2) * gUseP5;
        #endif
        #if (STP_GEAA_P > 0)
            gDst2 = gDst2 + (StpMF2_(4.5) - gDst2) * gUseP4;
            gDst2 = gDst2 + (StpMF2_(3.5) - gDst2) * gUseP3;
        #endif
        gDst2 = gDst2 + (StpMF2_(2.5) - gDst2) * gUseP2;
        gDst2 = gDst2 + (StpMF2_(1.5) - gDst2) * gUseP1;
        gDst2 = gDst2 + (StpMF2_(0.5) - gDst2) * gUseP0;
//------------------------------------------------------------------------------------------------------------------------------
        // Variable length low-pass box blur.
        StpMF1 gLoSub = (gDst2.x + gDst2.y) * StpMF1_(0.5) - StpMF1_(STP_GEAA_SUBPIX);
        StpMF2 gLoW01 = StpMF2_(1.0) - StpSatMF2(StpMF2(1.0, 2.0) - StpMF2_(gLoSub));
        StpMF2 gLoW23 = StpMF2_(1.0) - StpSatMF2(StpMF2(3.0, 4.0) - StpMF2_(gLoSub));
        StpMF2 gLoW45 = StpMF2_(1.0) - StpSatMF2(StpMF2(5.0, 6.0) - StpMF2_(gLoSub));
        StpMF2 gLoW67 = StpMF2_(1.0) - StpSatMF2(StpMF2(7.0, 8.0) - StpMF2_(gLoSub));
        StpMF2 gLoW89 = StpMF2_(1.0) - StpSatMF2(StpMF2(9.0,10.0) - StpMF2_(gLoSub));
        StpMF2 gLoAcc2 =
            gLo0 * StpMF2_(gLoW01.x) +
            gLo1 * StpMF2_(gLoW01.y) +
            gLo2 * StpMF2_(gLoW23.x) +
            gLo3 * StpMF2_(gLoW23.y) +
            gLo4 * StpMF2_(gLoW45.x) +
            gLo5 * StpMF2_(gLoW45.y) +
            gLo6 * StpMF2_(gLoW67.x) +
            gLo7 * StpMF2_(gLoW67.y) +
            gLo8 * StpMF2_(gLoW89.x);
        StpMF1 gLoAcc = gE + gLoAcc2.x + gLoAcc2.y;
        StpMF2 gLoW2 = gLoW01 + gLoW23 + gLoW45 + gLoW67;
        gLoW2 *= StpMF2_(2.0);
        gLoAcc *= StpRcpMF1(StpMF1_(1.0) + gLoW89.x * StpMF1_(2.0) + gLoW2.x + gLoW2.y);
        StpMF1 gOff = StpSatMF1((gLoAcc - gE) * StpRcpMF1(gBH.x - gE));
        // Limited to 0.5 weight for the pixel art scalar.
        gOff = min(gOff, StpMF1_(0.5));
//------------------------------------------------------------------------------------------------------------------------------
        gDilate = p + gDecon;
        gFilter = p + gDecon * StpF2_(gOff);
        gLuma = lerp(gE, gBH.x, gOff);
//------------------------------------------------------------------------------------------------------------------------------
        // Borrow weight from the 'E to H' side, T=(-2*A+B+E)/(E-H), then recompute the fixed weight.
        StpMF1 gAnti = lerp(gE, gBH.x, gOff);
        StpMF1 gT = StpSatMF1((StpMF1_(-2.0) * gAnti + gBH.x + gE) * StpRcpMF1(gE - gBH.y));
        StpMF1 gFix = gE * (gT - StpMF1_(1.0)) - gBH.y * gT;
        gFix = StpSatMF1((gFix + gAnti) * StpRcpMF1(gFix + gBH.x));
//------------------------------------------------------------------------------------------------------------------------------
        // Weight 't=1/(gFix+1/2)-1', sent squared, and kept above zero.
        gW = gFix;
        gW = StpRcpMF1(gW + StpMF1_(0.5)) - StpMF1_(1.0);
        gW *= gW;
        gW = max(gW, StpMF1_(1.0/255.0)); }
#endif // defined(STP_GPU) && defined(STP_GEAA) && defined(STP_32BIT)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________.._______________________________________________________________
//==============================================================================================================================
//                                               [GEAA] PACKED 16-BIT ENTRY POINT
//==============================================================================================================================
#if defined(STP_GPU) && defined(STP_GEAA) && defined(STP_16BIT)
    void StpGeaaH(
    out StpH1 gW,        // Output weight for pixel art scalar.
    out StpH1 gLuma,     // Filtered luma for debug.
    out StpF2 gFilter,   // Location to sample for standalone unscaled spatial AA.
    out StpF2 gDilate,   // Location of highest contrast neighbor.
    StpF2 p,             // {0 to 1} position across screen.
    StpF2 kRcpI,         // 1.0 / input image size in pixels.
    StpF2 kHalfRcpI) {   // 0.5 / input image size in pixels.
//------------------------------------------------------------------------------------------------------------------------------
        // Sample 3x3 input pattern in luma (or green).
        //  A B C
        //  D E F
        //  G H I
        // Via four gather4s, usage for the next section to try to improve operand caching.
        #if STP_OFFSETS
            StpF2 pDEBA = p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y);
            StpH4 gDEBA = StpGeaa4H(pDEBA);
            StpH4 gEFCB = StpGeaa4OH(pDEBA, StpI2(1, 0));
            StpH4 gGHED = StpGeaa4OH(pDEBA, StpI2(0, 1));
            StpH4 gHIFE = StpGeaa4OH(pDEBA, StpI2(1, 1));
        #else // STP_OFFSETS
            StpH4 gDEBA = StpGeaa4H(p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y)); // .xyzw=DEBA
            StpH4 gEFCB = StpGeaa4H(p + StpF2( kHalfRcpI.x, -kHalfRcpI.y)); // .yz  =FC
            StpH4 gGHED = StpGeaa4H(p + StpF2(-kHalfRcpI.x,  kHalfRcpI.y)); // .xy  =GH
            StpH4 gHIFE = StpGeaa4H(p + StpF2( kHalfRcpI.x,  kHalfRcpI.y)); // .y   =I
        #endif // STP_OFFSETS
//------------------------------------------------------------------------------------------------------------------------------
        // Compute {horz,vert} change terms. Complex to decide on either horizontal or vertical direction.
        // Trouble case for some algorithms,
        //  0 1 0
        //  0 1 0
        //  0 1 0
        // This should present as a vertical search direction.
        // Simple stuff like sum of each 2x2 produces,
        //  2 2
        //  2 2
        // Which has no direction.
        // {ABC,ADG}
        StpH2 gHV0,gHV1,gHV2;
        gHV0.x = gDEBA.z * StpH1_(-2.0) + gEFCB.z;
        gHV0.y = gDEBA.x * StpH1_(-2.0) + gGHED.x;
        gHV0 += StpH2_(gDEBA.w);
        // {DEF,BEH}
        gHV1.x = gDEBA.x + gEFCB.y;
        gHV1.y = gDEBA.z + gGHED.y;
        gHV1 += StpH2_(gDEBA.y) * StpH2_(-2.0);
        // {GHI,CFI}
        gHV2.x = gGHED.x + gGHED.y * StpH1_(-2.0);
        gHV2.y = gEFCB.z + gEFCB.y * StpH1_(-2.0);
        gHV2 += StpH2_(gHIFE.y);
        // Combine terms.
        #if 0
            // What FXAA does, better for a diagonal computation (which is not needed), left for reference.
            StpH2 gHV = abs(gHV0) + abs(gHV1) * StpH2_(2.0) + abs(gHV2);
        #else
            // Slightly faster for packed 16-bit (which has no free ABS on AMD).
            StpH2 gHV = gHV0 * gHV0 + gHV1 * gHV1 * StpH2_(2.0) + gHV2 * gHV2;
        #endif
        // Choose search direction, the 'gVert' is true:=vert, false:=horz.
        // Go vertical search if horizontal has higher contrast (search perpendicular).
        StpP1 gVert = gHV.x > gHV.y;
//------------------------------------------------------------------------------------------------------------------------------
        // This is BH if search horizontal, else DF (as BH) if search vertical.
        StpH2 gBH = gVert ? StpH2(gDEBA.x, gEFCB.y) : StpH2(gDEBA.z, gGHED.y);
        // Will need these later, will let the compiler move around the transpose.
        // The vertical variants are the transposed 3x3, so {A,C}->{A,G}, {D,F}->{B,H}, {G,I}->{C,I}.
        StpH2 gAC = gVert ? StpH2(gDEBA.w, gGHED.x) : StpH2(gDEBA.w, gEFCB.z);
        StpH2 gDF = gVert ? StpH2(gDEBA.z, gGHED.y) : StpH2(gDEBA.x, gEFCB.y);
        // The C texel is 'gEFCB.z' ('gEFCB.y' is F, which would pair the wrong texel with B in the Bi0 blend).
        StpH2 gGI = gVert ? StpH2(gEFCB.z, gHIFE.y) : StpH2(gGHED.x, gHIFE.y);
        // Start to compute threshold for end of span, compute a gradient pair.
        StpH2 gBHMinusE = gBH - StpH2_(gDEBA.y);
        StpH2 gEnd2 = abs(gBHMinusE);
        // If gradient is larger upward (or leftward if vert).
        StpP1 gUp = gEnd2.x >= gEnd2.y;
//------------------------------------------------------------------------------------------------------------------------------
        // Rename.
        StpH1 gE = gDEBA.y;
        // Swap if not up. From this point on, the B is the high-contrast neighbor, and the H is the other one in same dir.
        gBH = gUp ? gBH : gBH.yx;
//------------------------------------------------------------------------------------------------------------------------------
        // Choose the bilinear scalar (gets to 1/3 between texels during the search).
        //  .x ... For texel closer to pixel axis when up (reversed when down).
        //  .y ... For more distant texel.
        // LOGIC
        // =====
        // This keeps threshold of 2 of the 3 end conditions the same (so 1/3 shift is better than 1/4).
        // =====
        //  e         e    e   <- e = end cases
        //  0    0    1    1   <- 1/3 of high contrast neighbor
        //  0    1    0    1   <- 2/3 of self
        //  ------------------
        //  0   2/3  1/3   1   <- blended value (2/3 is the target)
        // 2/3   0   1/3  1/3  <- abs(difference to target)
        StpH2 gBi = gUp ? StpH2(2.0 / 3.0, 1.0 / 3.0) : StpH2(1.0 / 3.0 , 2.0 / 3.0);
        // Choose either {B-E, or H-E}.
        StpH1 gBMinusE = gUp ? gBHMinusE.x : gBHMinusE.y;
        // Finish Bi0, this is the first 2 texture fetches (done using math instead) at P0 (1 texel away from center).
        StpH2 gBi0 = (gUp ? gAC : gGI) * StpH2_(1.0 / 3.0) + gDF * StpH2_(2.0 / 3.0);
        // Finish Lo0, for the directional blur.
        StpH2 gLo0 = gDF;
        // Contrast to the highest contrast neighbor along the chosen direction, may report less contrast than actual.
        StpH1 gAbsBMinusE = abs(gBMinusE);
//------------------------------------------------------------------------------------------------------------------------------
        // One pixel walk distance for search.
        StpF2 gWalk = gVert ? StpF2(0.0, kRcpI.y) : StpF2(kRcpI.x, 0.0);
        // This is the direction of decontrast (towards the highest contrast neighbor).
        StpF2 gDecon = gVert ? StpF2(kRcpI.x, 0.0) : StpF2(0.0, kRcpI.y);
        // If up (or left) work negative.
        if(gUp) gDecon = -gDecon;
//------------------------------------------------------------------------------------------------------------------------------
        // Have enough now to build out sampling positions.
        // This works in gather4 to get two samples per gather, then uses math to finish the bilinear fetch.
        // In case the logic ever goes back to a non-gather4 version, this keeps with the 1/3 offset.
        // Build base, 1/3 to neighbor pixel.
        // It must be 1/3 to neighbor pixel to be able to find the end of thin stuff like this.
        //  . . . . . . . . . . .
        //  . . . . . . x x x x x
        //  . x x x x x . . . . .
        //    |       |
        //    |------>|
        //    |       . x
        // If it was 1/2 to neighbor, then x and . would look the same.
        StpF2 gP = p + gDecon * StpF2_(1.0/3.0);
        // The gather4 positions are (assuming horizontal then up).
        //  3 3 2 2 1 1 0 0 A B C 0 0 1 1 2 2 3 3
        //  3 3 2 2 1 1 0 0 D E F 0 0 1 1 2 2 3 3
        //                  G H I
//------------------------------------------------------------------------------------------------------------------------------
        // Sampling positions.
        // Currently walking without gaps, but could skip along too!
        StpF2 gPN3 = gP - StpF2_(8.5) * gWalk;
        StpF2 gPN2 = gP - StpF2_(6.5) * gWalk;
        StpF2 gPN1 = gP - StpF2_(4.5) * gWalk;
        StpF2 gPN0 = gP - StpF2_(2.5) * gWalk;
        StpF2 gPP0 = gP + StpF2_(2.5) * gWalk;
        StpF2 gPP1 = gP + StpF2_(4.5) * gWalk;
        StpF2 gPP2 = gP + StpF2_(6.5) * gWalk;
        StpF2 gPP3 = gP + StpF2_(8.5) * gWalk;
//------------------------------------------------------------------------------------------------------------------------------
        // This attempts to do sampling in a cache friendly way.
        // Cannot sample with offsets, because it could be vertical or horizontal and offsets need to be static in DX.
        // Sampling pairs {negative, positive} directions.
        StpH4 gGN3, gGN2, gGN1, gGN0, gGP0, gGP1, gGP2, gGP3;
        gGN3 = StpGeaa4H(gPN3);
        gGN2 = StpGeaa4H(gPN2);
        gGN1 = StpGeaa4H(gPN1);
        gGN0 = StpGeaa4H(gPN0);
        gGP0 = StpGeaa4H(gPP0);
        gGP1 = StpGeaa4H(gPP1);
        gGP2 = StpGeaa4H(gPP2);
        gGP3 = StpGeaa4H(gPP3);
//------------------------------------------------------------------------------------------------------------------------------
        // Finish the bilinear fetch.
        // For 'vertical' this needs to do a transpose.
        // The FMAs are duplicated, else the compiler would need to do that anyway.
        //                                  1st 2nd  for N side (P side is reversed)
        //  -----------                     |   |
        //  W Z     w z   !vert &  up ... Y X, Z W
        //  X Y [p] x y
        //  -----------
        //  W Z [p] w z   !vert & !up ... Z W, Y X
        //  X Y     x y
        //  -----------
        //  W Z            vert &  up ... Y Z, X W
        //  X Y
        //  [p]
        //  w z
        //  x y
        //  -----------
        //  W Z            vert & !up ... X W, Y Z
        //  X Y                           | |  | |
        //  [p]                           | |  0.33 term
        //  w z                           | |
        //  x y                           0.66 term
        //  -----------
        if(gVert) {
            gGN3 = gGN3.zyxw;
            gGN2 = gGN2.zyxw;
            gGN1 = gGN1.zyxw;
            gGN0 = gGN0.zyxw;
            gGP0 = gGP0.zyxw;
            gGP1 = gGP1.zyxw;
            gGP2 = gGP2.zyxw;
            gGP3 = gGP3.zyxw; }
//------------------------------------------------------------------------------------------------------------------------------
        // Grab the texels for the variable length inline low-pass box blur.
        StpH2 gLo8 = StpH2(gGN3.x, gGP3.y);
        StpH2 gLo7 = StpH2(gGN3.y, gGP3.x);
        StpH2 gLo6 = StpH2(gGN2.x, gGP2.y);
        StpH2 gLo5 = StpH2(gGN2.y, gGP2.x);
        StpH2 gLo4 = StpH2(gGN1.x, gGP1.y);
        StpH2 gLo3 = StpH2(gGN1.y, gGP1.x);
        StpH2 gLo2 = StpH2(gGN0.x, gGP0.y);
        StpH2 gLo1 = StpH2(gGN0.y, gGP0.x);
        if(!gUp) {
            gLo8 = StpH2(gGN3.w, gGP3.z);
            gLo7 = StpH2(gGN3.z, gGP3.w);
            gLo6 = StpH2(gGN2.w, gGP2.z);
            gLo5 = StpH2(gGN2.z, gGP2.w);
            gLo4 = StpH2(gGN1.w, gGP1.z);
            gLo3 = StpH2(gGN1.z, gGP1.w);
            gLo2 = StpH2(gGN0.w, gGP0.z);
            gLo1 = StpH2(gGN0.z, gGP0.w); }
//------------------------------------------------------------------------------------------------------------------------------
        // Simulate the bilinear fetch.
        StpH2 gGN3Bi = gGN3.yx * StpH2_(gBi.x) + gGN3.zw * StpH2_(gBi.y);
        StpH2 gGN2Bi = gGN2.yx * StpH2_(gBi.x) + gGN2.zw * StpH2_(gBi.y);
        StpH2 gGN1Bi = gGN1.yx * StpH2_(gBi.x) + gGN1.zw * StpH2_(gBi.y);
        StpH2 gGN0Bi = gGN0.yx * StpH2_(gBi.x) + gGN0.zw * StpH2_(gBi.y);
        StpH2 gGP0Bi = gGP0.yx * StpH2_(gBi.x) + gGP0.zw * StpH2_(gBi.y);
        StpH2 gGP1Bi = gGP1.yx * StpH2_(gBi.x) + gGP1.zw * StpH2_(gBi.y);
        StpH2 gGP2Bi = gGP2.yx * StpH2_(gBi.x) + gGP2.zw * StpH2_(gBi.y);
        StpH2 gGP3Bi = gGP3.yx * StpH2_(gBi.x) + gGP3.zw * StpH2_(gBi.y);
        // Note positive side the {x,y} order is reversed.
        StpH2 gBi8 = StpH2(gGN3Bi.y, gGP3Bi.x);
        StpH2 gBi7 = StpH2(gGN3Bi.x, gGP3Bi.y);
        StpH2 gBi6 = StpH2(gGN2Bi.y, gGP2Bi.x);
        StpH2 gBi5 = StpH2(gGN2Bi.x, gGP2Bi.y);
        StpH2 gBi4 = StpH2(gGN1Bi.y, gGP1Bi.x);
        StpH2 gBi3 = StpH2(gGN1Bi.x, gGP1Bi.y);
        StpH2 gBi2 = StpH2(gGN0Bi.y, gGP0Bi.x);
        StpH2 gBi1 = StpH2(gGN0Bi.x, gGP0Bi.y);
//------------------------------------------------------------------------------------------------------------------------------
        // Threshold for end of span (X), and base to compare against (Y).
        StpH2 gEndBase;
        // For a (1.0/3.0) pixel shift.
        // The 'gBMinusE = other - self', and want 'self * (2.0/3.0) + other * (1.0/3.0)'.
        gEndBase.y = gBMinusE * StpH1_(1.0/3.0) + gE;
        gEndBase.x = gAbsBMinusE * StpH1_(STP_GEAA_THRESHOLD);
        // Safer version here for reference.
        #if 0
            gEndBase.x = StpRcpH1(max(StpH1_(1.0 / 16384.0), gEndBase.x));
        #else
            gEndBase.x = StpPrxLoRcpH1(gEndBase.x);
        #endif
//------------------------------------------------------------------------------------------------------------------------------
        // Compute opacity term, {0 := not done, 1 := end of span}.
        #if (STP_GEAA_P > 2)
            StpH2 gUseP8 = StpSatH2(abs(gBi8 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
            StpH2 gUseP7 = StpSatH2(abs(gBi7 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
        #endif
        #if (STP_GEAA_P > 1)
            StpH2 gUseP6 = StpSatH2(abs(gBi6 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
            StpH2 gUseP5 = StpSatH2(abs(gBi5 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
        #endif
        #if (STP_GEAA_P > 0)
            StpH2 gUseP4 = StpSatH2(abs(gBi4 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
            StpH2 gUseP3 = StpSatH2(abs(gBi3 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
        #endif
        StpH2 gUseP2 = StpSatH2(abs(gBi2 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
        StpH2 gUseP1 = StpSatH2(abs(gBi1 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
        StpH2 gUseP0 = StpSatH2(abs(gBi0 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
//------------------------------------------------------------------------------------------------------------------------------
        // Work this like painters alpha blending.
        // This analog path is faster and cleaner than binary logic.
        // Distance traveled for {negative, positive} paths.
        // LOGIC
        // =====
        // Note distance factors already have the 0.5 factored in.
        //  N := negative search end (1 pixel away, but edge is 0.5 pixel away)
        //  P := positive search end (4 pixel away, but edge is 3.5 pixel away)
        //  X := the pixel to filter
        //      :<->:<------------->:
        //      :   :               :
        //      :   :           +---+---+---+---+
        //      :   :           | : |   |   |   |
        //    N +---+---+---+---+-P-+---+---+---+
        //      | X |   |   |   |   |   |   |   |
        //  +---+---+---+---+---+---+---+---+---+---+---+---+
        //  |   |   |   |   |   |   |   |   |   |   |   |   |
        //  +---+---+---+---+---+---+---+---+---+---+---+---+
        #if (STP_GEAA_P == 3)
            StpH2 gDst2 = StpH2_(9.5);
        #elif (STP_GEAA_P == 2)
            StpH2 gDst2 = StpH2_(7.5);
        #elif (STP_GEAA_P == 1)
            StpH2 gDst2 = StpH2_(5.5);
        #elif (STP_GEAA_P == 0)
            StpH2 gDst2 = StpH2_(3.5);
        #else
            // Without this, an unsupported value only shows up as an undeclared 'gDst2'.
            #error STP_GEAA_P must be 0, 1, 2, or 3
        #endif
        #if (STP_GEAA_P > 2)
            gDst2 = gDst2 + (StpH2_(8.5) - gDst2) * gUseP8;
            gDst2 = gDst2 + (StpH2_(7.5) - gDst2) * gUseP7;
        #endif
        #if (STP_GEAA_P > 1)
            gDst2 = gDst2 + (StpH2_(6.5) - gDst2) * gUseP6;
            gDst2 = gDst2 + (StpH2_(5.5) - gDst2) * gUseP5;
        #endif
        #if (STP_GEAA_P > 0)
            gDst2 = gDst2 + (StpH2_(4.5) - gDst2) * gUseP4;
            gDst2 = gDst2 + (StpH2_(3.5) - gDst2) * gUseP3;
        #endif
        gDst2 = gDst2 + (StpH2_(2.5) - gDst2) * gUseP2;
        gDst2 = gDst2 + (StpH2_(1.5) - gDst2) * gUseP1;
        gDst2 = gDst2 + (StpH2_(0.5) - gDst2) * gUseP0;
//------------------------------------------------------------------------------------------------------------------------------
        // Run the variable length low-pass box blur.
        // Need half distance with half pixel removed.
        StpH1 gLoSub = (gDst2.x + gDst2.y) * StpH1_(0.5) - StpH1_(STP_GEAA_SUBPIX);
        // Compute the weights (if should be included or not).
        StpH2 gLoW01 = StpH2_(1.0) - StpSatH2(StpH2(1.0, 2.0) - StpH2_(gLoSub));
        StpH2 gLoW23 = StpH2_(1.0) - StpSatH2(StpH2(3.0, 4.0) - StpH2_(gLoSub));
        StpH2 gLoW45 = StpH2_(1.0) - StpSatH2(StpH2(5.0, 6.0) - StpH2_(gLoSub));
        StpH2 gLoW67 = StpH2_(1.0) - StpSatH2(StpH2(7.0, 8.0) - StpH2_(gLoSub));
        StpH2 gLoW89 = StpH2_(1.0) - StpSatH2(StpH2(9.0,10.0) - StpH2_(gLoSub));
        // Weighted accumulation of samples.
        StpH2 gLoAcc2 =
            gLo0 * StpH2_(gLoW01.x) +
            gLo1 * StpH2_(gLoW01.y) +
            gLo2 * StpH2_(gLoW23.x) +
            gLo3 * StpH2_(gLoW23.y) +
            gLo4 * StpH2_(gLoW45.x) +
            gLo5 * StpH2_(gLoW45.y) +
            gLo6 * StpH2_(gLoW67.x) +
            gLo7 * StpH2_(gLoW67.y) +
            gLo8 * StpH2_(gLoW89.x);
        StpH1 gLoAcc = gE + gLoAcc2.x + gLoAcc2.y;
        // Weight sum.
        StpH2 gLoW2 = gLoW01 + gLoW23 + gLoW45 + gLoW67;
        gLoW2 *= StpH2_(2.0);
        gLoAcc *= StpRcpH1(StpH1_(1.0) + gLoW89.x * StpH1_(2.0) + gLoW2.x + gLoW2.y);
        // Convert to blend between self and high-contrast neighbor.
        // This currently allows full {0.0 to 1.0} blend.
        StpH1 gOff = StpSatH1((gLoAcc - gE) * StpRcpH1(gBH.x - gE));
        // It is important to not exceed 0.5 weight for PIXart scaling.
        gOff = min(gOff, StpH1_(0.5));
//------------------------------------------------------------------------------------------------------------------------------
        // Save out dilation pixel for {z,motion}.
        gDilate = p + gDecon;
        // Save out filter position.
        gFilter = p + gDecon * StpF2_(gOff);
        gLuma = lerp(gE, gBH.x, gOff);
//------------------------------------------------------------------------------------------------------------------------------
        // GEAA up to this point creates weights that only help a scalar for aliased edges.
        // This attempts to increase weight to also restore some anti-aliased edges.
        // It does this by increasing weight as much as can be borrowed from the 'E to H' side.
        // An equation for movement towards H,
        //  E+(H-E)*T ... Where T must be {0 to 1} ranged, but want {0 to 0.5} ranged (same as 'gOff').
        // Equation for E motion with respect to the B side,
        //  A=E+(B-E)*F ... Where A is the anti-aliased output, and F would typically be 'gOff'.
        // Solving that for E,
        //  E=(A-F*B)/(1-F)
        // Combining equations,
        //  E+(H-E)*T = (A-F*B)/(1-F)
        // Then solving for T when 'F=0.5' (maximum 'gOff' weight),
        //  T=(-2*A+B+E)/(E-H)
        // Then limit T inside {0 to 0.5}.
        // And use limited 'T' to recompute a new 'F' which becomes the 'gOff' fixed weight.
        StpH1 gAnti = lerp(gE, gBH.x, gOff);
        // Solve for the movement towards 'H'.
        // This in theory should be limited to {0 to 0.5}, but {0 to 1} seems to work too.
        StpH1 gT = StpSatH1((StpH1_(-2.0) * gAnti + gBH.x + gE) * StpRcpH1(gE - gBH.y));
        StpH1 gFix = gE * (gT - StpH1_(1.0)) - gBH.y * gT;
        gFix = StpSatH1((gFix + gAnti) * StpRcpH1(gFix + gBH.x));
//------------------------------------------------------------------------------------------------------------------------------
        // Output weight for pixel art scalar.
        // The 'gOff'set goes between {0 := no change, to 0.5 := half to neighbor}.
        // The half to neighbor position would be where the edge crosses between two pixels.
        // The sample size needs to be {0 := at the crossing, to 1 := no change}.
        // Can solve this, the 1D kernel will look like,
        //  u = (1-x)*s ... weighting terms
        //  v =    x *t
        //  w = 1/(u+v)
        //  o = a*u*w + b*v*w
        // The split is where weights are the same,
        //  u*w == v*w ... ((1-x)*s)/(((1-x)*s)+(x*t)) == (x*t)/(((1-x)*s)+(x*t))
        // Can assume s=1.0 (the other sample), thus this reduces to,
        //  u*w == v*w ... (1-x)/((1-x)+(x*t)) == (x*t)/((1-x)+(x*t))
        // Then solve for 't' given crossing point 'x'.
        //  t=1/x-1
        // Convert to 'x=gOffset+1/2'.
        // Solve for 't=1/x-1', or 't=1/(gOffset+1/2)-1'.
        gW = gFix;
        gW = StpRcpH1(gW + StpH1_(0.5)) - StpH1_(1.0);
        // Send squared (as needed by scalar).
        gW *= gW;
        // Make sure not zero.
        gW = max(gW, StpH1_(1.0/255.0)); }
#endif // defined(STP_GPU) && defined(STP_GEAA) && defined(STP_16BIT)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#endif // STP_UNITY_INCLUDE_GUARD