A game about forced loneliness, made by TACStudios
1#ifndef THREADING
2#define THREADING
3
4///
5/// Compute Shader Threading Utilities
6///
7/// This file is intended to provide a portable implementation of the wave-level operations in DirectX Shader Model 6.0.
8///
9/// The functions in this file will automatically resolve to native intrinsics when possible.
10/// A fallback groupshared memory implementation is used when native support is not available.
11///
12/// Usage:
13///
14/// To use this file, define all required preprocessor symbols and then include this file in your compute shader.
15///
16/// Required Preprocessor Symbols:
17///
18/// THREADING_BLOCK_SIZE
/// - The compute shader's flattened thread group size (i.e. the total number of threads in one thread group)
20///
21/// Optional Preprocessor Symbols:
22///
23/// THREADING_WAVE_SIZE
24/// - The size of a wave within the compute shader
25/// - This symbol MUST be defined when authoring shader code that requires a specific wave size for correctness!
26///
27/// THREADING_FORCE_WAVE_EMULATION
28/// - If defined, forces usage of the fallback groupshared memory implementation
29///
30
// THREADING_BLOCK_SIZE is mandatory: compilation is aborted when the including shader did not define it.
#if !defined(THREADING_BLOCK_SIZE)
#error THREADING_BLOCK_SIZE must be defined as the flattened thread group size.
#endif
34
// Selection between the native and the emulated implementation.
//
// The native (non-emulated) path is only usable when the current device natively supports wave-level
// operations (UNITY_HW_SUPPORTS_WAVE). If THREADING_WAVE_SIZE is provided, the device's wave size
// (UNITY_HW_WAVE_SIZE) must additionally be known and match THREADING_WAVE_SIZE.
//
// When these minimum requirements are not met, the emulation path is enabled automatically.
// It can also be forced on for debug/testing purposes by defining THREADING_FORCE_WAVE_EMULATION.
#define _THREADING_IS_HW_SUPPORTED \
    (defined(UNITY_HW_SUPPORTS_WAVE) && \
        (!defined(THREADING_WAVE_SIZE) || \
            (defined(UNITY_HW_WAVE_SIZE) && (UNITY_HW_WAVE_SIZE == THREADING_WAVE_SIZE))))

#define _THREADING_ENABLE_WAVE_EMULATION \
    (!_THREADING_IS_HW_SUPPORTED || defined(THREADING_FORCE_WAVE_EMULATION))

// THREADING_BLOCK_SIZE divided by 32, rounded up. Used as the length of the dword array in GroupBallot.
#define _THREADING_GROUP_BALLOT_DWORDS ((THREADING_BLOCK_SIZE + 31u) / 32u)
44
45namespace Threading
46{
struct Wave
{
    // These members would ideally be hidden from callers, but 'private' is a reserved keyword in HLSL.
    // NOTE(review): they are presumably filled in by Init(), which is defined outside of this header - confirm.
    uint indexG;
    uint indexW;
#if _THREADING_ENABLE_WAVE_EMULATION
    // Extra state that only exists when the emulation path is compiled in.
    uint indexL;
    uint offset; // Per-wave offset into LDS scratch space.
#endif

    uint GetIndex();

    void Init(uint groupIndex);

    // Declares the part of the wave API that is overloaded per value type.
    #define DECLARE_API_FOR_TYPE(TYPE)          \
        bool AllEqual(TYPE v);                  \
        TYPE Sum(TYPE v);                       \
        TYPE Product(TYPE v);                   \
        TYPE Min(TYPE v);                       \
        TYPE Max(TYPE v);                       \
        TYPE PrefixSum(TYPE v);                 \
        TYPE PrefixProduct(TYPE v);             \
        TYPE InclusivePrefixSum(TYPE v);        \
        TYPE InclusivePrefixProduct(TYPE v);    \
        TYPE ReadLaneAt(TYPE v, uint i);        \
        TYPE ReadLaneFirst(TYPE v);

    // Only scalar types are supported for now.
    DECLARE_API_FOR_TYPE(uint)
    DECLARE_API_FOR_TYPE(int)
    DECLARE_API_FOR_TYPE(float)

    // The remaining intrinsics are not overloaded per type, so a single declaration is enough.
    uint  GetLaneCount();
    uint  GetLaneIndex();
    bool  IsFirstLane();

    bool  AllTrue(bool v);
    bool  AnyTrue(bool v);
    uint4 Ballot(bool v);
    uint  CountBits(bool v);
    uint  PrefixCountBits(bool v);

    uint  And(uint v);
    uint  Or(uint v);
    uint  Xor(uint v);
};
92
// Result type of Group::Ballot(): an array of _THREADING_GROUP_BALLOT_DWORDS 32-bit words.
struct GroupBallot
{
    uint dwords[_THREADING_GROUP_BALLOT_DWORDS];

    // Returns the number of set bits, accumulated over every word of the ballot.
    uint CountBits()
    {
        uint total = 0;

        [unroll]
        for (uint i = 0; i < _THREADING_GROUP_BALLOT_DWORDS; i++)
        {
            total += countbits(dwords[i]);
        }

        return total;
    }
};
110
// Thread-group level API. The data members are bound to the compute shader system-value semantics below.
struct Group
{
    uint  groupIndex : SV_GroupIndex;
    uint3 groupID    : SV_GroupID;
    uint3 dispatchID : SV_DispatchThreadID;

    // Returns a Wave that was zero-initialized and then initialized with this thread's groupIndex.
    Wave GetWave()
    {
        Wave result = (Wave)0;
        result.Init(groupIndex);
        return result;
    }

    // Remaps the lane index in a way that is safe both for portability (different wave sizes up to 128)
    // and for 2D wave reductions.
    //
    // Bits of the lane index that feed each output coordinate:
    //
    //   6543210
    //   =======
    //   ..xx..x
    //   yy..yy.
    //
    // In detail:
    //
    //   LANE TO 8x16 MAPPING
    //   ====================
    //   00 01 08 09 10 11 18 19
    //   02 03 0a 0b 12 13 1a 1b
    //   04 05 0c 0d 14 15 1c 1d
    //   06 07 0e 0f 16 17 1e 1f
    //   20 21 28 29 30 31 38 39
    //   22 23 2a 2b 32 33 3a 3b
    //   24 25 2c 2d 34 35 3c 3d
    //   26 27 2e 2f 36 37 3e 3f
    //   .......................
    //   ... repeat the 8x8 ....
    //   .... pattern, but .....
    //   .... for 40 to 7f .....
    //   .......................
    //
    // NOTE: This function is only intended to be used with one dimensional thread groups.
    uint2 RemapLaneTo8x16()
    {
        // The inserts that provide the MSBs use "strange offsets": space has to be left for the LSB bits
        // that get replaced by the insert.
        const uint xMsbSource = BitFieldExtract(groupIndex, 2u, 3u);
        const uint yLsbSource = BitFieldExtract(groupIndex, 1u, 2u);
        const uint yMsbSource = BitFieldExtract(groupIndex, 3u, 4u);

        uint2 remapped;
        remapped.x = BitFieldInsert(1u, groupIndex, xMsbSource);
        remapped.y = BitFieldInsert(3u, yLsbSource, yMsbSource);
        return remapped;
    }

    uint GetWaveCount();

    // Declares the part of the group API that is overloaded per value type.
    #define DECLARE_API_FOR_TYPE_GROUP(TYPE)    \
        bool AllEqual(TYPE v);                  \
        TYPE Sum(TYPE v);                       \
        TYPE Product(TYPE v);                   \
        TYPE Min(TYPE v);                       \
        TYPE Max(TYPE v);                       \
        TYPE PrefixSum(TYPE v);                 \
        TYPE PrefixProduct(TYPE v);             \
        TYPE InclusivePrefixSum(TYPE v);        \
        TYPE InclusivePrefixProduct(TYPE v);    \
        TYPE ReadThreadAt(TYPE v, uint i);      \
        TYPE ReadThreadFirst(TYPE v);           \
        TYPE ReadThreadShuffle(TYPE v, uint i);

    // Only scalar types are supported for now.
    DECLARE_API_FOR_TYPE_GROUP(uint)
    DECLARE_API_FOR_TYPE_GROUP(int)
    DECLARE_API_FOR_TYPE_GROUP(float)

    // The remaining intrinsics are not overloaded per type, so a single declaration is enough.
    uint        GetThreadCount();
    uint        GetThreadIndex();
    bool        IsFirstThread();

    bool        AllTrue(bool v);
    bool        AnyTrue(bool v);
    GroupBallot Ballot(bool v);
    uint        CountBits(bool v);
    uint        PrefixCountBits(bool v);

    uint        And(uint v);
    uint        Or(uint v);
    uint        Xor(uint v);
};
190}
191
192#if _THREADING_ENABLE_WAVE_EMULATION
193 #include "ThreadingEmuImpl.hlsl"
194#else
195 #include "ThreadingSM6Impl.hlsl"
196#endif
197
198#endif