A game about forced loneliness, made by TACStudios
#ifndef THREADING
#define THREADING

///
/// Compute Shader Threading Utilities
///
/// Portable implementation of the wave-level operations found in DirectX Shader Model 6.0.
///
/// Functions declared here resolve to native intrinsics whenever that is possible.
/// When native support is unavailable, a fallback implementation based on groupshared memory is used instead.
///
/// Usage:
///
///     Define every required preprocessor symbol, then include this file from the compute shader.
///
/// Required Preprocessor Symbols:
///
///     THREADING_BLOCK_SIZE
///         - The flattened thread group size of the compute shader
///
/// Optional Preprocessor Symbols:
///
///     THREADING_WAVE_SIZE
///         - The size of a wave within the compute shader
///         - MUST be defined by any shader code whose correctness depends on a specific wave size!
///
///     THREADING_FORCE_WAVE_EMULATION
///         - When defined, forces the fallback groupshared memory implementation
///

#ifndef THREADING_BLOCK_SIZE
    #error THREADING_BLOCK_SIZE must be defined as the flattened thread group size.
#endif

// Emulation is switched on automatically for hardware that does not meet the minimum requirements.
//
// The non-emulated path requires native wave-level operation support on the current device.
// When THREADING_WAVE_SIZE is provided, the device's wave size must additionally equal THREADING_WAVE_SIZE.
//
// For debugging/testing, emulation can also be forced through the THREADING_FORCE_WAVE_EMULATION symbol.
//
// NOTE: these macros expand to `defined(...)` tests, so they are only meaningful inside #if / #elif
// conditions, and they are evaluated at the point of use.
#define _THREADING_IS_HW_SUPPORTED (defined(UNITY_HW_SUPPORTS_WAVE) && (!defined(THREADING_WAVE_SIZE) || (defined(UNITY_HW_WAVE_SIZE) && (UNITY_HW_WAVE_SIZE == THREADING_WAVE_SIZE))))
#define _THREADING_ENABLE_WAVE_EMULATION (!_THREADING_IS_HW_SUPPORTED || defined(THREADING_FORCE_WAVE_EMULATION))

// Number of 32-bit words needed to hold THREADING_BLOCK_SIZE bits (rounded up).
#define _THREADING_GROUP_BALLOT_DWORDS ((THREADING_BLOCK_SIZE + 31u) / 32u)

namespace Threading
{
    // Wave-level API. Only the declarations live here; the bodies come from the implementation
    // file included at the bottom of this header.
    struct Wave
    {
        // These fields are internal state. They cannot be hidden because 'private' is a
        // reserved keyword in HLSL.
        uint indexG;
        uint indexW;
#if _THREADING_ENABLE_WAVE_EMULATION
        uint indexL;
        uint offset; // Per-wave offset into LDS scratch space.
#endif

        uint GetIndex();

        void Init(uint groupIndex);

        // Declares the per-type portion of the wave API for a single scalar TYPE.
        #define DECLARE_API_FOR_TYPE(TYPE)                                                                       \
            bool AllEqual(TYPE v);                                                                               \
            TYPE Product(TYPE v);                                                                                \
            TYPE Sum(TYPE v);                                                                                    \
            TYPE Max(TYPE v);                                                                                    \
            TYPE Min(TYPE v);                                                                                    \
            TYPE InclusivePrefixSum(TYPE v);                                                                     \
            TYPE InclusivePrefixProduct(TYPE v);                                                                 \
            TYPE PrefixSum(TYPE v);                                                                              \
            TYPE PrefixProduct(TYPE v);                                                                          \
            TYPE ReadLaneAt(TYPE v, uint i);                                                                     \
            TYPE ReadLaneFirst(TYPE v);

        // Only scalar types are supported for now.
        DECLARE_API_FOR_TYPE(uint)
        DECLARE_API_FOR_TYPE(int)
        DECLARE_API_FOR_TYPE(float)

        // Type-independent intrinsics: a single declaration of each is sufficient.
        uint  GetLaneCount();
        uint  GetLaneIndex();
        bool  IsFirstLane();
        bool  AllTrue(bool v);
        bool  AnyTrue(bool v);
        uint4 Ballot(bool v);
        uint  CountBits(bool v);
        uint  PrefixCountBits(bool v);
        uint  And(uint v);
        uint  Or(uint v);
        uint  Xor(uint v);
    };

    // Result type of Group::Ballot: a bit mask stored as _THREADING_GROUP_BALLOT_DWORDS 32-bit words,
    // i.e. large enough for THREADING_BLOCK_SIZE bits.
    struct GroupBallot
    {
        uint dwords[_THREADING_GROUP_BALLOT_DWORDS];

        // Returns the total number of set bits across every word of the mask.
        uint CountBits()
        {
            uint total = 0;

            [unroll]
            for (uint i = 0; i < _THREADING_GROUP_BALLOT_DWORDS; ++i)
            {
                total += countbits(dwords[i]);
            }

            return total;
        }
    };

    // Thread-group-level API. The fields carry compute shader system-value semantics, so an
    // instance can be bound to the system values of the current thread.
    struct Group
    {
        uint  groupIndex : SV_GroupIndex;
        uint3 groupID    : SV_GroupID;
        uint3 dispatchID : SV_DispatchThreadID;

        // Builds a zero-initialized Wave and initializes it from this thread's groupIndex.
        Wave GetWave()
        {
            Wave wave = (Wave)0;
            wave.Init(groupIndex);
            return wave;
        }

        // Lane remap which is safe for both portability (different wave sizes up to 128) and for 2D wave reductions.
        //  6543210
        //  =======
        //  ..xx..x
        //  yy..yy.
        // Details,
        //  LANE TO 8x16 MAPPING
        //  ====================
        //  00 01 08 09 10 11 18 19
        //  02 03 0a 0b 12 13 1a 1b
        //  04 05 0c 0d 14 15 1c 1d
        //  06 07 0e 0f 16 17 1e 1f
        //  20 21 28 29 30 31 38 39
        //  22 23 2a 2b 32 33 3a 3b
        //  24 25 2c 2d 34 35 3c 3d
        //  26 27 2e 2f 36 37 3e 3f
        //  .......................
        //  ... repeat the 8x8 ....
        //  .... pattern, but .....
        //  .... for 40 to 7f .....
        //  .......................
        // NOTE: This function is only intended to be used with one dimensional thread groups
        uint2 RemapLaneTo8x16()
        {
            // The bit fields feeding the MSBs are extracted at "strange offsets": room is left for
            // the LSB bits that the following BitFieldInsert replaces.
            const uint xUpper = BitFieldExtract(groupIndex, 2u, 3u);
            const uint yLower = BitFieldExtract(groupIndex, 1u, 2u);
            const uint yUpper = BitFieldExtract(groupIndex, 3u, 4u);

            const uint x = BitFieldInsert(1u, groupIndex, xUpper);
            const uint y = BitFieldInsert(3u, yLower, yUpper);

            return uint2(x, y);
        }

        uint GetWaveCount();

        // Declares the per-type portion of the group API for a single scalar TYPE.
        #define DECLARE_API_FOR_TYPE_GROUP(TYPE)                                                                 \
            bool AllEqual(TYPE v);                                                                               \
            TYPE Product(TYPE v);                                                                                \
            TYPE Sum(TYPE v);                                                                                    \
            TYPE Max(TYPE v);                                                                                    \
            TYPE Min(TYPE v);                                                                                    \
            TYPE InclusivePrefixSum(TYPE v);                                                                     \
            TYPE InclusivePrefixProduct(TYPE v);                                                                 \
            TYPE PrefixSum(TYPE v);                                                                              \
            TYPE PrefixProduct(TYPE v);                                                                          \
            TYPE ReadThreadAt(TYPE v, uint i);                                                                   \
            TYPE ReadThreadFirst(TYPE v);                                                                        \
            TYPE ReadThreadShuffle(TYPE v, uint i);

        // Only scalar types are supported for now.
        DECLARE_API_FOR_TYPE_GROUP(uint)
        DECLARE_API_FOR_TYPE_GROUP(int)
        DECLARE_API_FOR_TYPE_GROUP(float)

        // Type-independent intrinsics: a single declaration of each is sufficient.
        uint        GetThreadCount();
        uint        GetThreadIndex();
        bool        IsFirstThread();
        bool        AllTrue(bool v);
        bool        AnyTrue(bool v);
        GroupBallot Ballot(bool v);
        uint        CountBits(bool v);
        uint        PrefixCountBits(bool v);
        uint        And(uint v);
        uint        Or(uint v);
        uint        Xor(uint v);
    };
}

// Pull in the bodies for the declarations above: the emulated implementation or the SM6 one.
#if _THREADING_ENABLE_WAVE_EMULATION
    #include "ThreadingEmuImpl.hlsl"
#else
    #include "ThreadingSM6Impl.hlsl"
#endif

#endif