Library/PackageCache/com.unity.render-pipelines.core/Runtime/STP/StpSetup.compute at master · tacstudios.tngl.sh/AloneGame

A game about forced loneliness, made by TACStudios
AloneGame / Library / PackageCache / com.unity.render-pipelines.core / Runtime / STP / StpSetup.compute
at master 509 lines 22 kB view raw
  1#pragma kernel StpSetup
  2
  3#pragma multi_compile _ ENABLE_DEBUG_MODE
  4#pragma multi_compile _ ENABLE_STENCIL_RESPONSIVE
  5#pragma multi_compile _ ENABLE_LARGE_KERNEL
  6
  7#pragma multi_compile _ UNITY_DEVICE_SUPPORTS_NATIVE_16BIT
  8
  9// TODO: Re-enable support for wave reductions (usage of UNITY_DEVICE_SUPPORTS_WAVE_ANY keyword)
 10//
 11// We've run into many platform specific problems when trying to use wave operations for STP's reductions so they're being
 12// disabled for now. Enabling support for wave operations also causes us to use DXC on the 32-bit path on some Qualcomm Android
 13// devices and this triggers visual artifacts that we have no other way to work around at the moment.
 14
 15#pragma multi_compile _ DISABLE_TEXTURE2D_X_ARRAY
 16
 17#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch
 18
 19#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
 20#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Color.hlsl"
 21#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/UnityInstancing.hlsl"
 22
 23#define STP_PAT 1
 24
 25#include "Packages/com.unity.render-pipelines.core/Runtime/STP/StpCommon.hlsl"
 26
 27//
 28// Input (read through the StpPatDat* load callbacks defined further down in this file)
 29//
 30
 31TEXTURE2D_X(_StpInputColor); // .rgb is loaded by StpPatDatColH / StpPatDatColF
 32TEXTURE2D_X(_StpInputDepth); // .x is loaded by StpPatDatZH / StpPatDatZF
 33TEXTURE2D_X(_StpInputMotion); // .xy is loaded by StpPatDatMotH / StpPatDatMotF
 34
 35#if defined(ENABLE_STENCIL_RESPONSIVE)
 36    TYPED_TEXTURE2D_X(uint2, _StpInputStencil); // .xy is handed to GetStencilValue() by StpPatDatRH / StpPatDatRF
 37#endif
 38
 39//
 40// Intermediate Output (written through the StpPatSt* store callbacks)
 41//
 42
 43RW_TEXTURE2D_X(float4, _StpIntermediateColor); // written by StpPatStColH / StpPatStColF
 44RW_TEXTURE2D_X(float, _StpIntermediateConvergence); // written by StpPatStCnvH / StpPatStCnvF at coordinate (p >> 2)
 45
 46//
 47// History Input/Output
 48//
 49
 50TYPED_TEXTURE2D_X(uint, _StpPriorDepthMotion); // red channel gathered with s_point_clamp_sampler by StpPatPriMot4H / StpPatPriMot4F
 51RW_TEXTURE2D_X(uint, _StpDepthMotion); // written by StpPatStMotH / StpPatStMotF
 52
 53TEXTURE2D_X(_StpPriorLuma); // sampled with s_linear_clamp_sampler by StpPatPriLumH / StpPatPriLumF
 54RW_TEXTURE2D_X(float2, _StpLuma); // written by StpPatStLumH / StpPatStLumF
 55
 56TEXTURE2D_X(_StpPriorConvergence); // sampled with s_linear_clamp_sampler by StpPatPriConH / StpPatPriConF
 57
 58TEXTURE2D_X(_StpPriorFeedback); // sampled (linear) and gathered per channel (point) by the StpPatPriFed* callbacks
 59// Index of the first per-view constant for the current slice; the kernel reads entries +0 .. +7 relative to it.
 60#define STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET (SLICE_ARRAY_INDEX * STPSETUPPERVIEWCONSTANTS_COUNT)
 61
 62#if defined(SHADER_API_PSSL) || defined(SHADER_API_SWITCH) || (defined(SHADER_API_METAL) && !defined(SHADER_API_MOBILE))
 63    // Force usage of the 32-bit reduction path even in 16-bit environments
 64    #define STP_FORCE_32BIT_REDUCTION
 65#endif
 66
 67#if defined(SHADER_API_PSSL) || defined(SHADER_API_GAMECORE) || defined(SHADER_API_METAL) || (defined(SHADER_API_VULKAN) && defined(SHADER_API_MOBILE))
 68    // Force usage of group shared memory instead of using wave operations
 69    #define STP_FORCE_GROUPSHARED
 70#endif
 71
 72// Enable the use of wave operations when they're supported by the current hardware and usage of groupshared hasn't been forced.
 73#if defined(UNITY_HW_SUPPORTS_WAVE) && !defined(STP_FORCE_GROUPSHARED)
 74    #define STP_ENABLE_WAVEOPS
 75#endif
 76
 77// STP requires a 4x4 reduction which must be implemented by either wave operations, or group shared memory.
 78#if !defined(STP_ENABLE_WAVEOPS)
 79#if defined(STP_16BIT) && !defined(STP_FORCE_32BIT_REDUCTION)
 80groupshared uint4 gs_StpScratch[STP_GROUP_SIZE]; // 16-bit path: one uint4 per lane, each uint produced by StpU1_H2 from a half pair (a.xy, a.zw, b.xy, b.zw)
 81#else
 82groupshared float4 gs_StpScratch[STP_GROUP_SIZE * 2]; // 32-bit path: entries [0, STP_GROUP_SIZE) hold 'a', entries [STP_GROUP_SIZE, 2 * STP_GROUP_SIZE) hold 'b'
 83#endif
 84#endif
 85
 86// In some cases, we have to expose the 32-bit reduction code in the 16-bit path
 87#if defined(STP_32BIT) || defined(STP_FORCE_32BIT_REDUCTION)
 // Exchange helper used by StpPat4x4MaxF8; exactly one of the two variants is compiled.
#if defined(STP_ENABLE_WAVEOPS)
// Max-combines every component of 'a' and 'b' with the same component held by the wave lane
// whose index is this lane's index XOR 'laneXor'.
void StpPat4x4MaxF8_WaveStep(uint laneXor, inout StpF4 a, inout StpF4 b)
{
    const uint partner = WaveGetLaneIndex() ^ laneXor;

    a.x = max(a.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), partner)));
    a.y = max(a.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), partner)));
    a.z = max(a.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), partner)));
    a.w = max(a.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), partner)));
    b.x = max(b.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.x), partner)));
    b.y = max(b.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.y), partner)));
    b.z = max(b.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.z), partner)));
    b.w = max(b.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.w), partner)));
}
#else
// One groupshared exchange round. Every lane publishes 'a' and 'b', then max-combines its own
// values with those of the three other lanes at (i + step), (i + 2 * step) and (i + 3 * step),
// wrapped inside the aligned block of (mask + 1) lanes that contains 'i'.
void StpPat4x4MaxF8_SharedStep(StpMU1 i, StpMU1 step, StpMU1 mask, inout StpF4 a, inout StpF4 b)
{
    // 'a' lives in the lower half of the scratch array, 'b' in the upper half.
    gs_StpScratch[i] = a;
    gs_StpScratch[i + STP_GROUP_SIZE] = b;

    GroupMemoryBarrierWithGroupSync();

    const StpMU1 blockBase = i & ~mask;
    const StpMU1 peer0 = blockBase + ((i + step) & mask);
    const StpMU1 peer1 = blockBase + ((i + step * StpMU1(2)) & mask);
    const StpMU1 peer2 = blockBase + ((i + step * StpMU1(3)) & mask);

    const float4 peerA0 = gs_StpScratch[peer0];
    const float4 peerA1 = gs_StpScratch[peer1];
    const float4 peerA2 = gs_StpScratch[peer2];

    const float4 peerB0 = gs_StpScratch[peer0 + STP_GROUP_SIZE];
    const float4 peerB1 = gs_StpScratch[peer1 + STP_GROUP_SIZE];
    const float4 peerB2 = gs_StpScratch[peer2 + STP_GROUP_SIZE];

    // All reads must complete before any lane overwrites the scratch array again.
    GroupMemoryBarrierWithGroupSync();

    a = max(max(max(a, peerA0), peerA1), peerA2);
    b = max(max(max(b, peerB0), peerB1), peerB2);
}
#endif

// 32-bit 4x4 max reduction of two float4 values.
//
// On return, every component of 'a' and 'b' holds the maximum of that component over a set of
// 16 lanes: with wave operations, the lanes whose wave lane index differs only in the low four
// bits; with group shared memory, the lanes that share the same (i & ~15).
//
// i    - lane index used to address gs_StpScratch (unused on the wave path)
// a, b - values to reduce; overwritten with the reduced result
void StpPat4x4MaxF8(StpMU1 i, inout StpF4 a, inout StpF4 b)
{
#if defined(STP_ENABLE_WAVEOPS)
    // XOR butterfly over lane index bits 0..3.
    StpPat4x4MaxF8_WaveStep(1, a, b);
    StpPat4x4MaxF8_WaveStep(2, a, b);
    StpPat4x4MaxF8_WaveStep(4, a, b);
    StpPat4x4MaxF8_WaveStep(8, a, b);
#else
    // 2x2 Reduction: groups of 4 consecutive lanes
    StpPat4x4MaxF8_SharedStep(i, StpMU1(1), StpMU1(3), a, b);

    // 4x4 Reduction: combines the four groups of 4 inside each block of 16 lanes
    StpPat4x4MaxF8_SharedStep(i, StpMU1(4), StpMU1(15), a, b);
#endif
}
// Exchange helper used by StpPat4x4SumF4; exactly one of the two variants is compiled.
#if defined(STP_ENABLE_WAVEOPS)
// Adds to every component of 'a' the same component held by the wave lane whose index is this
// lane's index XOR 'laneXor'.
void StpPat4x4SumF4_WaveStep(uint laneXor, inout StpF4 a)
{
    const uint partner = WaveGetLaneIndex() ^ laneXor;

    a.x += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), partner));
    a.y += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), partner));
    a.z += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), partner));
    a.w += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), partner));
}
#else
// One groupshared exchange round. Every lane publishes 'a', then adds the values of the three
// other lanes at (i + step), (i + 2 * step) and (i + 3 * step), wrapped inside the aligned
// block of (mask + 1) lanes that contains 'i'.
void StpPat4x4SumF4_SharedStep(StpMU1 i, StpMU1 step, StpMU1 mask, inout StpF4 a)
{
    gs_StpScratch[i] = a;

    GroupMemoryBarrierWithGroupSync();

    const StpMU1 blockBase = i & ~mask;
    const StpMU1 peer0 = blockBase + ((i + step) & mask);
    const StpMU1 peer1 = blockBase + ((i + step * StpMU1(2)) & mask);
    const StpMU1 peer2 = blockBase + ((i + step * StpMU1(3)) & mask);

    const float4 term0 = gs_StpScratch[peer0];
    const float4 term1 = gs_StpScratch[peer1];
    const float4 term2 = gs_StpScratch[peer2];

    // All reads must complete before any lane overwrites the scratch array again.
    GroupMemoryBarrierWithGroupSync();

    // Left-to-right accumulation, own value first.
    a = a + term0 + term1 + term2;
}
#endif

// 32-bit 4x4 sum reduction of one float4 value.
//
// On return, every component of 'a' holds the sum of that component over a set of 16 lanes:
// with wave operations, the lanes whose wave lane index differs only in the low four bits;
// with group shared memory, the lanes that share the same (i & ~15).
//
// i - lane index used to address gs_StpScratch (unused on the wave path)
// a - value to reduce; overwritten with the reduced result
void StpPat4x4SumF4(StpMU1 i, inout StpF4 a)
{
#if defined(STP_ENABLE_WAVEOPS)
    // XOR butterfly over lane index bits 0..3.
    StpPat4x4SumF4_WaveStep(1, a);
    StpPat4x4SumF4_WaveStep(2, a);
    StpPat4x4SumF4_WaveStep(4, a);
    StpPat4x4SumF4_WaveStep(8, a);
#else
    // 2x2 Reduction: groups of 4 consecutive lanes
    StpPat4x4SumF4_SharedStep(i, StpMU1(1), StpMU1(3), a);

    // 4x4 Reduction: combines the four groups of 4 inside each block of 16 lanes
    StpPat4x4SumF4_SharedStep(i, StpMU1(4), StpMU1(15), a);
#endif
}
248#endif
249
250#if defined(STP_16BIT)
// Exchange helper used by StpPat4x4MaxH8. None is needed when the 32-bit reduction is forced;
// otherwise exactly one of the two variants is compiled.
#if !defined(STP_FORCE_32BIT_REDUCTION)
#if defined(STP_ENABLE_WAVEOPS)
// Max-combines 'a' and 'b' with the values held by the wave lane whose index is this lane's
// index XOR 'laneXor'. Each half pair travels as a single uint (StpU1_H2 / StpH2_U1).
void StpPat4x4MaxH8_WaveStep(uint laneXor, inout StpH4 a, inout StpH4 b)
{
    const uint partner = WaveGetLaneIndex() ^ laneXor;

    a.xy = max(a.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), partner)));
    a.zw = max(a.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), partner)));
    b.xy = max(b.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.xy), partner)));
    b.zw = max(b.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.zw), partner)));
}
#else
// One groupshared exchange round. Every lane publishes 'a' and 'b' as one uint4
// (x: a.xy, y: a.zw, z: b.xy, w: b.zw), then max-combines its own values with those of the
// three other lanes at (i + step), (i + 2 * step) and (i + 3 * step), wrapped inside the
// aligned block of (mask + 1) lanes that contains 'i'.
void StpPat4x4MaxH8_SharedStep(StpW1 i, StpW1 step, StpW1 mask, inout StpH4 a, inout StpH4 b)
{
    gs_StpScratch[i] = StpU4(StpU1_H2(a.xy), StpU1_H2(a.zw), StpU1_H2(b.xy), StpU1_H2(b.zw));

    GroupMemoryBarrierWithGroupSync();

    const StpW1 blockBase = i & ~mask;
    const StpW1 peer0 = blockBase + ((i + step) & mask);
    const StpW1 peer1 = blockBase + ((i + step * StpW1(2)) & mask);
    const StpW1 peer2 = blockBase + ((i + step * StpW1(3)) & mask);

    const uint4 packed0 = gs_StpScratch[peer0];
    const uint4 packed1 = gs_StpScratch[peer1];
    const uint4 packed2 = gs_StpScratch[peer2];

    // All reads must complete before any lane overwrites the scratch array again.
    GroupMemoryBarrierWithGroupSync();

    a.xy = max(max(max(a.xy, StpH2_U1(packed0.x)), StpH2_U1(packed1.x)), StpH2_U1(packed2.x));
    a.zw = max(max(max(a.zw, StpH2_U1(packed0.y)), StpH2_U1(packed1.y)), StpH2_U1(packed2.y));
    b.xy = max(max(max(b.xy, StpH2_U1(packed0.z)), StpH2_U1(packed1.z)), StpH2_U1(packed2.z));
    b.zw = max(max(max(b.zw, StpH2_U1(packed0.w)), StpH2_U1(packed1.w)), StpH2_U1(packed2.w));
}
#endif
#endif

// 16-bit 4x4 max reduction of two half4 values.
//
// With STP_FORCE_32BIT_REDUCTION the work is forwarded to StpPat4x4MaxF8. Otherwise, on return
// every component of 'a' and 'b' holds the maximum of that component over a set of 16 lanes:
// with wave operations, the lanes whose wave lane index differs only in the low four bits;
// with group shared memory, the lanes that share the same (i & ~15).
//
// i    - lane index used to address gs_StpScratch (unused on the wave path)
// a, b - values to reduce; overwritten with the reduced result
void StpPat4x4MaxH8(StpW1 i, inout StpH4 a, inout StpH4 b)
{
#if defined(STP_FORCE_32BIT_REDUCTION)
    StpPat4x4MaxF8(i, a, b);
#elif defined(STP_ENABLE_WAVEOPS)
    // XOR butterfly over lane index bits 0..3.
    StpPat4x4MaxH8_WaveStep(1, a, b);
    StpPat4x4MaxH8_WaveStep(2, a, b);
    StpPat4x4MaxH8_WaveStep(4, a, b);
    StpPat4x4MaxH8_WaveStep(8, a, b);
#else
    // 2x2 Reduction: groups of 4 consecutive lanes
    StpPat4x4MaxH8_SharedStep(i, StpW1(1), StpW1(3), a, b);

    // 4x4 Reduction: combines the four groups of 4 inside each block of 16 lanes
    StpPat4x4MaxH8_SharedStep(i, StpW1(4), StpW1(15), a, b);
#endif
}
// Exchange helper used by StpPat4x4SumH4. None is needed when the 32-bit reduction is forced;
// otherwise exactly one of the two variants is compiled.
#if !defined(STP_FORCE_32BIT_REDUCTION)
#if defined(STP_ENABLE_WAVEOPS)
// Adds to 'a' the value held by the wave lane whose index is this lane's index XOR 'laneXor'.
// Each half pair travels as a single uint (StpU1_H2 / StpH2_U1).
void StpPat4x4SumH4_WaveStep(uint laneXor, inout StpH4 a)
{
    const uint partner = WaveGetLaneIndex() ^ laneXor;

    a.xy += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), partner));
    a.zw += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), partner));
}
#else
// One groupshared exchange round. Every lane publishes 'a' in the .xy part of its scratch entry
// (x: a.xy, y: a.zw), then adds the values of the three other lanes at (i + step),
// (i + 2 * step) and (i + 3 * step), wrapped inside the aligned block of (mask + 1) lanes
// that contains 'i'.
void StpPat4x4SumH4_SharedStep(StpW1 i, StpW1 step, StpW1 mask, inout StpH4 a)
{
    gs_StpScratch[i].xy = StpU2(StpU1_H2(a.xy), StpU1_H2(a.zw));

    GroupMemoryBarrierWithGroupSync();

    const StpW1 blockBase = i & ~mask;
    const StpW1 peer0 = blockBase + ((i + step) & mask);
    const StpW1 peer1 = blockBase + ((i + step * StpW1(2)) & mask);
    const StpW1 peer2 = blockBase + ((i + step * StpW1(3)) & mask);

    const uint2 packed0 = gs_StpScratch[peer0].xy;
    const uint2 packed1 = gs_StpScratch[peer1].xy;
    const uint2 packed2 = gs_StpScratch[peer2].xy;

    // All reads must complete before any lane overwrites the scratch array again.
    GroupMemoryBarrierWithGroupSync();

    // Left-to-right accumulation, own value first.
    a.xy = a.xy + StpH2_U1(packed0.x) + StpH2_U1(packed1.x) + StpH2_U1(packed2.x);
    a.zw = a.zw + StpH2_U1(packed0.y) + StpH2_U1(packed1.y) + StpH2_U1(packed2.y);
}
#endif
#endif

// 16-bit 4x4 sum reduction of one half4 value.
//
// With STP_FORCE_32BIT_REDUCTION the work is forwarded to StpPat4x4SumF4. Otherwise, on return
// every component of 'a' holds the sum of that component over a set of 16 lanes: with wave
// operations, the lanes whose wave lane index differs only in the low four bits; with group
// shared memory, the lanes that share the same (i & ~15).
//
// i - lane index used to address gs_StpScratch (unused on the wave path)
// a - value to reduce; overwritten with the reduced result
void StpPat4x4SumH4(StpW1 i, inout StpH4 a)
{
#if defined(STP_FORCE_32BIT_REDUCTION)
    StpPat4x4SumF4(i, a);
#elif defined(STP_ENABLE_WAVEOPS)
    // XOR butterfly over lane index bits 0..3.
    StpPat4x4SumH4_WaveStep(1, a);
    StpPat4x4SumH4_WaveStep(2, a);
    StpPat4x4SumH4_WaveStep(4, a);
    StpPat4x4SumH4_WaveStep(8, a);
#else
    // 2x2 Reduction: groups of 4 consecutive lanes
    StpPat4x4SumH4_SharedStep(i, StpW1(1), StpW1(3), a);

    // 4x4 Reduction: combines the four groups of 4 inside each block of 16 lanes
    StpPat4x4SumH4_SharedStep(i, StpW1(4), StpW1(15), a);
#endif
}
// Samples _StpPriorConvergence at 'p' (linear clamp sampler, LOD 0) and casts the result to StpH1.
StpH1 StpPatPriConH(StpF2 p)
{
    return (StpH1)SAMPLE_TEXTURE2D_X_LOD(_StpPriorConvergence, s_linear_clamp_sampler, p, 0);
}

// The input loads below are kept as separate callbacks to support inline operation
// (pass merged instead of loads).

// Loads the .xy of _StpInputMotion at texel 'o', LOD 0.
StpF2 StpPatDatMotH(StpW2 o)
{
    return LOAD_TEXTURE2D_X_LOD(_StpInputMotion, o, 0).xy;
}

// Loads the .rgb of _StpInputColor at texel 'o', LOD 0, cast to StpH3.
StpH3 StpPatDatColH(StpW2 o)
{
    return (StpH3)LOAD_TEXTURE2D_X_LOD(_StpInputColor, o, 0).rgb;
}

// Loads the .x of _StpInputDepth at texel 'o', LOD 0.
StpF1 StpPatDatZH(StpW2 o)
{
    return LOAD_TEXTURE2D_X_LOD(_StpInputDepth, o, 0).x;
}

// This provides a place to convert Z from depth to linear if not inlined and actually loaded.
// Returns 1 / (STP_ZBUFFER_PARAMS_Z * z + STP_ZBUFFER_PARAMS_W).
StpF1 StpPatFixZH(StpF1 z)
{
    return 1.0 / (STP_ZBUFFER_PARAMS_Z * z + STP_ZBUFFER_PARAMS_W);
}
// Returns the stencil value at texel 'o' when ENABLE_STENCIL_RESPONSIVE is defined
// (GetStencilValue applied to the .xy of _StpInputStencil), and 0 otherwise.
StpU1 StpPatDatRH(StpW2 o)
{
#if defined(ENABLE_STENCIL_RESPONSIVE)
    return GetStencilValue(LOAD_TEXTURE2D_X_LOD(_StpInputStencil, o, 0).xy);
#else
    return StpU1_(0);
#endif // defined(ENABLE_STENCIL_RESPONSIVE)
}
// Converts the value returned by StpPatDatRH into a 0/1 factor.
// Returns 1.0 only when the history is valid and, with ENABLE_STENCIL_RESPONSIVE, no bit of 'v'
// is set in the stencil mask decoded from STP_COMMON_CONSTANT; returns 0.0 otherwise.
// A missing valid history therefore activates the "responsive" feature.
StpH1 StpPatFixRH(StpU1 v)
{
#if defined(ENABLE_STENCIL_RESPONSIVE)
    const bool stencilExcluded = (v & DecodeStencilMask(STP_COMMON_CONSTANT)) != 0;
#else
    const bool stencilExcluded = false;
#endif // defined(ENABLE_STENCIL_RESPONSIVE)

    const bool historyUsable = DecodeHasValidHistory(STP_COMMON_CONSTANT) && !stencilExcluded;

    return historyUsable ? StpH1_(1.0) : StpH1_(0.0);
}
412
// Dither value for pixel 'o', forwarded from StpDitH1.
StpH1 StpPatDitH(StpW2 o)
{
    return StpDitH1(o);
}

// Samples _StpPriorFeedback at 'p' with the linear clamp sampler, LOD 0.
StpH4 StpPatPriFedH(StpF2 p)
{
    return (StpH4)SAMPLE_TEXTURE2D_X_LOD(_StpPriorFeedback, s_linear_clamp_sampler, p, 0);
}

// Gathers the red channel of _StpPriorFeedback around 'p' with the point clamp sampler.
StpH4 StpPatPriFedR4H(StpF2 p)
{
    return (StpH4)GATHER_RED_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}

// Gathers the green channel of _StpPriorFeedback around 'p' with the point clamp sampler.
StpH4 StpPatPriFedG4H(StpF2 p)
{
    return (StpH4)GATHER_GREEN_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}

// Gathers the blue channel of _StpPriorFeedback around 'p' with the point clamp sampler.
StpH4 StpPatPriFedB4H(StpF2 p)
{
    return (StpH4)GATHER_BLUE_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}

// Samples _StpPriorLuma at 'p' with the linear clamp sampler, LOD 0.
StpH2 StpPatPriLumH(StpF2 p)
{
    return (StpH2)SAMPLE_TEXTURE2D_X_LOD(_StpPriorLuma, s_linear_clamp_sampler, p, 0);
}

// Gathers the red channel of _StpPriorDepthMotion around 'p' with the point clamp sampler.
StpU4 StpPatPriMot4H(StpF2 p)
{
    return GATHER_RED_TEXTURE2D_X(_StpPriorDepthMotion, s_point_clamp_sampler, p);
}
420
// Stores 'v' into _StpDepthMotion at pixel 'p'.
void StpPatStMotH(StpW2 p, StpU1 v)
{
    _StpDepthMotion[COORD_TEXTURE2D_X(p)] = v;
}

// Stores 'v' into _StpIntermediateColor at pixel 'p'.
void StpPatStColH(StpW2 p, StpH4 v)
{
    _StpIntermediateColor[COORD_TEXTURE2D_X(p)] = v;
}

// Stores 'v' into _StpLuma at pixel 'p'.
void StpPatStLumH(StpW2 p, StpH2 v)
{
    _StpLuma[COORD_TEXTURE2D_X(p)] = v;
}

// Stores 'v' into _StpIntermediateConvergence at (p >> 2), i.e. with both coordinates divided by 4.
void StpPatStCnvH(StpW2 p, StpH1 v)
{
    _StpIntermediateConvergence[COORD_TEXTURE2D_X(p >> StpW1(2))] = v;
}
425#endif
426
427#if defined(STP_32BIT)
// Samples _StpPriorConvergence at 'p' (linear clamp sampler, LOD 0) and casts the result to StpMF1.
StpMF1 StpPatPriConF(StpF2 p)
{
    return (StpMF1)SAMPLE_TEXTURE2D_X_LOD(_StpPriorConvergence, s_linear_clamp_sampler, p, 0);
}

// The input loads below are kept as separate callbacks to support inline operation
// (pass merged instead of loads).

// Loads the .xy of _StpInputMotion at texel 'o', LOD 0.
StpF2 StpPatDatMotF(StpMU2 o)
{
    return LOAD_TEXTURE2D_X_LOD(_StpInputMotion, o, 0).xy;
}

// Loads the .rgb of _StpInputColor at texel 'o', LOD 0, cast to StpMF3.
StpMF3 StpPatDatColF(StpMU2 o)
{
    return (StpMF3)LOAD_TEXTURE2D_X_LOD(_StpInputColor, o, 0).rgb;
}

// Loads the .x of _StpInputDepth at texel 'o', LOD 0.
StpF1 StpPatDatZF(StpMU2 o)
{
    return LOAD_TEXTURE2D_X_LOD(_StpInputDepth, o, 0).x;
}

// This provides a place to convert Z from depth to linear if not inlined and actually loaded.
// Returns 1 / (STP_ZBUFFER_PARAMS_Z * z + STP_ZBUFFER_PARAMS_W).
StpF1 StpPatFixZF(StpF1 z)
{
    return 1.0 / (STP_ZBUFFER_PARAMS_Z * z + STP_ZBUFFER_PARAMS_W);
}
// Returns the stencil value at texel 'o' when ENABLE_STENCIL_RESPONSIVE is defined
// (GetStencilValue applied to the .xy of _StpInputStencil), and 0 otherwise.
StpU1 StpPatDatRF(StpMU2 o)
{
#if defined(ENABLE_STENCIL_RESPONSIVE)
    return GetStencilValue(LOAD_TEXTURE2D_X_LOD(_StpInputStencil, o, 0).xy);
#else
    return StpU1_(0);
#endif // defined(ENABLE_STENCIL_RESPONSIVE)
}
// Converts the value returned by StpPatDatRF into a 0/1 factor.
// Returns 1.0 only when the history is valid and, with ENABLE_STENCIL_RESPONSIVE, no bit of 'v'
// is set in the stencil mask decoded from STP_COMMON_CONSTANT; returns 0.0 otherwise.
// A missing valid history therefore activates the "responsive" feature.
StpMF1 StpPatFixRF(StpU1 v)
{
#if defined(ENABLE_STENCIL_RESPONSIVE)
    const bool stencilExcluded = (v & DecodeStencilMask(STP_COMMON_CONSTANT)) != 0;
#else
    const bool stencilExcluded = false;
#endif // defined(ENABLE_STENCIL_RESPONSIVE)

    const bool historyUsable = DecodeHasValidHistory(STP_COMMON_CONSTANT) && !stencilExcluded;

    return historyUsable ? StpMF1_(1.0) : StpMF1_(0.0);
}
449
// Dither value for pixel 'o', forwarded from StpDitF1 and cast to StpMF1.
StpMF1 StpPatDitF(StpMU2 o)
{
    return (StpMF1)StpDitF1(o);
}

// Samples _StpPriorFeedback at 'p' with the linear clamp sampler, LOD 0.
StpMF4 StpPatPriFedF(StpF2 p)
{
    return (StpMF4)SAMPLE_TEXTURE2D_X_LOD(_StpPriorFeedback, s_linear_clamp_sampler, p, 0);
}

// Gathers the red channel of _StpPriorFeedback around 'p' with the point clamp sampler.
StpMF4 StpPatPriFedR4F(StpF2 p)
{
    return (StpMF4)GATHER_RED_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}

// Gathers the green channel of _StpPriorFeedback around 'p' with the point clamp sampler.
StpMF4 StpPatPriFedG4F(StpF2 p)
{
    return (StpMF4)GATHER_GREEN_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}

// Gathers the blue channel of _StpPriorFeedback around 'p' with the point clamp sampler.
StpMF4 StpPatPriFedB4F(StpF2 p)
{
    return (StpMF4)GATHER_BLUE_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p);
}

// Samples _StpPriorLuma at 'p' with the linear clamp sampler, LOD 0.
StpMF2 StpPatPriLumF(StpF2 p)
{
    return (StpMF2)SAMPLE_TEXTURE2D_X_LOD(_StpPriorLuma, s_linear_clamp_sampler, p, 0);
}

// Gathers the red channel of _StpPriorDepthMotion around 'p' with the point clamp sampler.
StpU4 StpPatPriMot4F(StpF2 p)
{
    return GATHER_RED_TEXTURE2D_X(_StpPriorDepthMotion, s_point_clamp_sampler, p);
}
457
// Stores 'v' into _StpDepthMotion at pixel 'p'.
void StpPatStMotF(StpMU2 p, StpU1 v)
{
    _StpDepthMotion[COORD_TEXTURE2D_X(p)] = v;
}

// Stores 'v' into _StpIntermediateColor at pixel 'p'.
void StpPatStColF(StpMU2 p, StpMF4 v)
{
    _StpIntermediateColor[COORD_TEXTURE2D_X(p)] = v;
}

// Stores 'v' into _StpLuma at pixel 'p'.
void StpPatStLumF(StpMU2 p, StpMF2 v)
{
    _StpLuma[COORD_TEXTURE2D_X(p)] = v;
}

// Stores 'v' into _StpIntermediateConvergence at (p >> 2), i.e. with both coordinates divided by 4.
void StpPatStCnvF(StpMU2 p, StpMF1 v)
{
    _StpIntermediateConvergence[COORD_TEXTURE2D_X(p >> StpMU1(2))] = v;
}
462#endif
463
464#define THREADING_BLOCK_SIZE STP_GROUP_SIZE
465#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Threading.hlsl"
466
// Kernel entry point for the STP setup pass.
//
// Launch layout: STP_GROUP_SIZE threads per group along X. group.groupID.z is passed to
// UNITY_XR_ASSIGN_VIEW_INDEX, and group.groupID.xy is turned into the group's pixel origin by
// ComputeGroupPos. Each thread adds the offset returned by StpRemapLaneTo8x16H / F for its
// lane to get its pixel, then runs StpPatH (16-bit) or StpPatF (32-bit) with the six global
// setup constants and the eight per-view constants reinterpreted as uint.
[numthreads(STP_GROUP_SIZE, 1, 1)]
void StpSetup(Threading::Group group)
{
    // Must run before the per-view constants are indexed: the stereo offset depends on SLICE_ARRAY_INDEX.
    UNITY_XR_ASSIGN_VIEW_INDEX(group.groupID.z);

    // Local shorthand for per-view constant 'n' of the current view, reinterpreted as uint.
    #define STP_SETUP_VIEW_CONSTANT(n) asuint(_StpSetupPerViewConstants[STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET + (n)])

#if defined(STP_16BIT)
    StpW1 laneIndex = StpW1_(group.groupIndex);
    StpW2 groupOrigin = ComputeGroupPos(StpW2(group.groupID.xy));
    StpW2 pixel = groupOrigin + StpRemapLaneTo8x16H(laneIndex);

    StpPatH(
#else
    StpMU1 laneIndex = StpMU1_(group.groupIndex);
    StpMU2 groupOrigin = ComputeGroupPos(StpMU2(group.groupID.xy));
    StpMU2 pixel = groupOrigin + StpRemapLaneTo8x16F(laneIndex);

    StpPatF(
#endif
        laneIndex,
        pixel,

        // Global setup constants
        asuint(_StpSetupConstants0),
        asuint(_StpSetupConstants1),
        asuint(_StpSetupConstants2),
        asuint(_StpSetupConstants3),
        asuint(_StpSetupConstants4),
        asuint(_StpSetupConstants5),

        // Per-view setup constants
        STP_SETUP_VIEW_CONSTANT(0),
        STP_SETUP_VIEW_CONSTANT(1),
        STP_SETUP_VIEW_CONSTANT(2),
        STP_SETUP_VIEW_CONSTANT(3),
        STP_SETUP_VIEW_CONSTANT(4),
        STP_SETUP_VIEW_CONSTANT(5),
        STP_SETUP_VIEW_CONSTANT(6),
        STP_SETUP_VIEW_CONSTANT(7)
    );

    #undef STP_SETUP_VIEW_CONSTANT
}
509