nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
at python-updates 317 lines 11 kB view raw
# Builds AMD composable_kernel by splitting its very large set of generated
# kernel-instance build targets into many independent derivations ("parts"),
# each of which compiles a subset of the object files, and then reassembling
# those object files into a final build of composable_kernel_base.
{
  lib,
  clr,
  composable_kernel_base,
}:

let
  inherit (composable_kernel_base) miOpenReqLibsOnly;

  # One attribute per build "part". Each value is an argument set for
  # tensorOpBuilder below:
  #   targets         - make targets this part builds
  #   enabled         - when false the part is skipped (yields null); most
  #                     GEMM/pool/misc parts are disabled when only the
  #                     MIOpen-required libraries are requested
  #   extraCmakeFlags - appended to the base derivation's cmakeFlags
  #   onlyFor         - if non-empty, the part is built only when at least one
  #                     listed gfx arch is among the configured gpuTargets
  parts = {
    _mha = {
      enabled = !miOpenReqLibsOnly;
      # mha takes ~3hrs on 64 cores on an EPYC milan system at ~2.5GHz
      # big-parallel builders are one gen newer and clocked ~30% higher but only 24 cores
      # Should be <10h timeout but might be cutting it close
      # TODO: work out how to split this into smaller chunks instead of all 3k mha instances together
      # mha_0,1,2, search ninja target file for the individual instances, split by the index?
      # TODO: can we prune the generated instances down to only what in practice are used with popular models
      # when using flash-attention + MHA kernels?
      targets = [
        "device_mha_instance"
      ];
      extraCmakeFlags = [ "-DHIP_CLANG_NUM_PARALLEL_JOBS=2" ];
    };
    gemm_multiply_multiply = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_gemm_multiply_multiply_instance"
      ];
      extraCmakeFlags = [ "-DHIP_CLANG_NUM_PARALLEL_JOBS=2" ];
      onlyFor = [
        "gfx942"
        "gfx950"
      ];
    };
    gemm_multiply_multiply_wp = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_gemm_multiply_multiply_wp_instance"
      ];
      extraCmakeFlags = [ "-DHIP_CLANG_NUM_PARALLEL_JOBS=2" ];
      onlyFor = [
        "gfx942"
        "gfx950"
      ];
    };
    grouped_conv_bwd = {
      targets = [
        "device_grouped_conv1d_bwd_weight_instance"
        "device_grouped_conv2d_bwd_data_instance"
        "device_grouped_conv2d_bwd_weight_instance"
      ];
    };
    grouped_conv_fwd = {
      targets = [
        "device_grouped_conv1d_fwd_instance"
        "device_grouped_conv2d_fwd_instance"
        "device_grouped_conv2d_fwd_bias_clamp_instance"
        "device_grouped_conv2d_fwd_clamp_instance"
        "device_grouped_conv2d_fwd_dynamic_op_instance"
      ];
    };
    grouped_conv_bwd_3d1 = {
      targets = [
        "device_grouped_conv3d_bwd_data_instance"
        "device_grouped_conv3d_bwd_data_bilinear_instance"
        "device_grouped_conv3d_bwd_data_scale_instance"
      ];
    };
    grouped_conv_bwd_3d2 = {
      targets = [
        "device_grouped_conv3d_bwd_weight_instance"
        "device_grouped_conv3d_bwd_weight_bilinear_instance"
        "device_grouped_conv3d_bwd_weight_scale_instance"
      ];
    };
    grouped_conv_fwd_3d1 = {
      targets = [
        "device_grouped_conv3d_fwd_instance"
        "device_grouped_conv3d_fwd_clamp_instance"
        "device_grouped_conv3d_fwd_bias_clamp_instance"
        "device_grouped_conv3d_fwd_bilinear_instance"
        "device_grouped_conv3d_fwd_convinvscale_instance"
        "device_grouped_conv3d_fwd_convscale_instance"
        "device_grouped_conv3d_fwd_convscale_add_instance"
      ];
    };
    grouped_conv_fwd_3d2 = {
      targets = [
        "device_grouped_conv3d_fwd_convscale_relu_instance"
        "device_grouped_conv3d_fwd_dynamic_op_instance"
        "device_grouped_conv3d_fwd_scale_instance"
        "device_grouped_conv3d_fwd_scaleadd_ab_instance"
        "device_grouped_conv3d_fwd_scaleadd_scaleadd_relu_instance"
      ];
    };
    # NOTE(review): despite the "fwd" name this part builds a bwd_weight
    # target — confirm the part name against upstream intent.
    grouped_conv_fwd_nd = {
      targets = [
        "device_grouped_convnd_bwd_weight_instance"
      ];
    };
    batched_gemm1 = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_batched_gemm_instance"
        "device_batched_gemm_b_scale_instance"
        "device_batched_gemm_multi_d_instance"
        "device_batched_gemm_add_relu_gemm_add_instance"
        "device_batched_gemm_bias_permute_instance"
        "device_batched_gemm_gemm_instance"
        "device_batched_gemm_reduce_instance"
        "device_batched_gemm_softmax_gemm_instance"
      ];
    };
    batched_gemm2 = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_batched_gemm_softmax_gemm_permute_instance"
        "device_grouped_gemm_instance"
        "device_grouped_gemm_bias_instance"
        "device_grouped_gemm_fastgelu_instance"
        "device_grouped_gemm_fixed_nk_instance"
        "device_grouped_gemm_fixed_nk_multi_abd_instance"
        "device_grouped_gemm_tile_loop_instance"
      ];
    };
    gemm_universal1 = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_gemm_universal_instance"
        "device_gemm_universal_batched_instance"
      ];
    };
    gemm_universal2 = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_gemm_universal_reduce_instance"
        "device_gemm_universal_streamk_instance"
      ];
    };
    gemm_other1 = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_gemm_instance"
        "device_gemm_b_scale_instance"
        "device_gemm_ab_scale_instance"
        "device_gemm_add_instance"
        "device_gemm_add_add_fastgelu_instance"
        "device_gemm_add_fastgelu_instance"
        "device_gemm_add_multiply_instance"
        "device_gemm_add_relu_instance"
      ];
    };
    gemm_other2 = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_gemm_add_relu_add_layernorm_instance"
        "device_gemm_add_silu_instance"
        "device_gemm_bias_add_reduce_instance"
        "device_gemm_bilinear_instance"
        "device_gemm_fastgelu_instance"
        "device_gemm_multi_abd_instance"
        "device_gemm_multiply_add_instance"
        "device_gemm_reduce_instance"
        "device_gemm_splitk_instance"
        "device_gemm_streamk_instance"
      ];
    };
    conv = {
      targets = [
        "device_conv1d_bwd_data_instance"
        "device_conv2d_bwd_data_instance"
        "device_conv2d_fwd_instance"
        "device_conv2d_fwd_bias_relu_instance"
        "device_conv2d_fwd_bias_relu_add_instance"
        "device_conv3d_bwd_data_instance"
      ];
    };
    pool = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_avg_pool2d_bwd_instance"
        "device_avg_pool3d_bwd_instance"
        "device_pool2d_fwd_instance"
        "device_pool3d_fwd_instance"
        "device_max_pool_bwd_instance"
      ];
    };
    other0 = {
      targets = [
        "device_quantization_instance"
      ];
    };
    other1 = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_batchnorm_instance"
        "device_contraction_bilinear_instance"
        "device_contraction_scale_instance"
        "device_elementwise_instance"
        "device_elementwise_normalization_instance"
      ];
    };
    other2 = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_column_to_image_instance"
        "device_image_to_column_instance"
        "device_permute_scale_instance"
        "device_reduce_instance"
      ];
    };
    other3 = {
      enabled = !miOpenReqLibsOnly;
      targets = [
        "device_normalization_bwd_data_instance"
        "device_normalization_bwd_gamma_beta_instance"
        "device_normalization_fwd_instance"
        "device_softmax_instance"
        "device_transpose_instance"
      ];
    };
  };

  # Builds one "part" derivation from the argument sets above (plus `part`,
  # the attribute name, injected by the mapAttrs below). Returns null when the
  # part is disabled or none of its `onlyFor` archs is being targeted; the
  # final assembly filters those nulls out.
  tensorOpBuilder =
    {
      part,
      targets,
      extraCmakeFlags ? [ ],
      requiredSystemFeatures ? [ "big-parallel" ],
      enabled ? true,
      onlyFor ? [ ],
    }:
    let
      # A part is built only if enabled and either unrestricted (onlyFor
      # empty) or overlapping the configured gpuTargets.
      supported =
        enabled
        && (onlyFor == [ ] || (lib.lists.intersectLists composable_kernel_base.gpuTargets onlyFor) != [ ]);
    in
    if supported then
      (composable_kernel_base.overrideAttrs (old: {
        inherit requiredSystemFeatures;
        pname = "composable_kernel${clr.gpuArchSuffix}-${part}";
        makeTargets = targets;
        # Restrict the build to just this part's targets, and neutralize the
        # generated Makefile's .NOTPARALLEL so those targets build in parallel.
        preBuild = ''
          echo "Building ${part}"
          makeFlagsArray+=($makeTargets)
          substituteInPlace $(find ./ -name "Makefile" -type f) \
            --replace-fail '.NOTPARALLEL:' '.UNUSED_NOTPARALLEL:'
        '';

        # Compile parallelism adjusted based on available RAM
        # Never uses less than NIX_BUILD_CORES/3, never uses more than NIX_BUILD_CORES
        # CK uses an unusually high amount of memory per core in the build step
        # Nix/nixpkgs doesn't really have any infra to tell it that this build is unusually memory hungry
        # So, bodge. Otherwise you end up having to build all of ROCm with a low core limit when
        # it's only this package that has trouble.
        preConfigure = old.preConfigure or "" + ''
          MEM_GB_TOTAL=$(awk '/MemTotal/ { printf "%d \n", $2/1024/1024 }' /proc/meminfo)
          MEM_GB_AVAILABLE=$(awk '/MemAvailable/ { printf "%d \n", $2/1024/1024 }' /proc/meminfo)
          APPX_GB=$((MEM_GB_AVAILABLE > MEM_GB_TOTAL ? MEM_GB_TOTAL : MEM_GB_AVAILABLE))
          MAX_CORES=$((1 + APPX_GB/3))
          MAX_CORES=$((MAX_CORES < NIX_BUILD_CORES/3 ? NIX_BUILD_CORES/3 : MAX_CORES))
          export NIX_BUILD_CORES="$((NIX_BUILD_CORES > MAX_CORES ? MAX_CORES : NIX_BUILD_CORES))"
          echo "Picked new core limit NIX_BUILD_CORES=$NIX_BUILD_CORES based on available mem: $APPX_GB GB"
          cmakeFlagsArray+=(
            "-DCK_PARALLEL_COMPILE_JOBS=$NIX_BUILD_CORES"
          )
        '';
        cmakeFlags = old.cmakeFlags ++ extraCmakeFlags;
        # Early exit after build phase with success, skips fixups etc
        # Will get copied back into /build of the final CK
        postBuild = ''
          find . -name "*.o" -type f | while read -r file; do
            mkdir -p "$out/$(dirname "$file")"
            cp --reflink=auto "$file" "$out/$file"
          done
          exit 0
        '';
        meta = old.meta // {
          # Parts must always be attemptable; brokenness is decided on the
          # final assembly below, not per part.
          broken = false;
        };
      }))
    else
      null;

  # `targets` here is the whole per-part argument set, not just its targets
  # list; the attribute name is injected as `part`.
  composable_kernel_parts = builtins.mapAttrs (
    part: targets: tensorOpBuilder (targets // { inherit part; })
  ) parts;
in

composable_kernel_base.overrideAttrs (
  finalAttrs: old: {
    pname = "composable_kernel${clr.gpuArchSuffix}";
    # Space-separated (in the builder env) list of part outputs to pull .o
    # files from; disabled parts are filtered out as nulls.
    parts_dirs = builtins.filter (x: x != null) (builtins.attrValues composable_kernel_parts);
    # The final output must not retain store references to the part outputs —
    # their object files are copied into the build tree, not linked.
    disallowedReferences = builtins.filter (x: x != null) (builtins.attrValues composable_kernel_parts);
    preBuild = ''
      for dir in $parts_dirs; do
        find "$dir" -type f -name "*.o" | while read -r file; do
          # Extract the relative path by removing the output directory prefix
          rel_path="''${file#"$dir/"}"

          # Create parent directory if it doesn't exist
          mkdir -p "$(dirname "$rel_path")"

          # Copy the file back to its original location, give it a future timestamp
          # so make treats it as up to date
          cp --reflink=auto --no-preserve=all "$file" "$rel_path"
          touch -d "now +10 hours" "$rel_path"
        done
      done
    '';
    passthru = old.passthru // {
      parts = composable_kernel_parts;
    };
    meta = old.meta // {
      # Builds without any gfx9 fail
      broken = !finalAttrs.passthru.anyGfx9Target;
    };
  }
)