rocmPackages.composable_kernel: make more parts big-parallel

All parts except `pool` build too slowly without the higher core limit available on big-parallel builders, so mark them with `requiredSystemFeatures = [ "big-parallel" ]`.

Infra channel discussion: https://matrix.to/#/!RROtHmAaQIkiJzJZZE:nixos.org/

Luna Nova 934f488c 0b4a36dc

+8
+8
pkgs/development/rocm-modules/6/composable_kernel/default.nix
··· 36 "device_grouped_conv2d_fwd_instance" 37 "device_grouped_conv2d_fwd_dynamic_op_instance" 38 ]; 39 }; 40 grouped_conv_bwd_3d = { 41 targets = [ ··· 46 "device_grouped_conv3d_bwd_weight_bilinear_instance" 47 "device_grouped_conv3d_bwd_weight_scale_instance" 48 ]; 49 }; 50 grouped_conv_fwd_3d = { 51 targets = [ ··· 60 "device_grouped_conv3d_fwd_scaleadd_ab_instance" 61 "device_grouped_conv3d_fwd_scaleadd_scaleadd_relu_instance" 62 ]; 63 }; 64 batched_gemm = { 65 targets = [ ··· 77 "device_grouped_gemm_fixed_nk_multi_abd_instance" 78 "device_grouped_gemm_tile_loop_instance" 79 ]; 80 }; 81 gemm_universal = { 82 targets = [ ··· 108 "device_gemm_splitk_instance" 109 "device_gemm_streamk_instance" 110 ]; 111 }; 112 conv = { 113 targets = [ ··· 118 "device_conv2d_fwd_bias_relu_add_instance" 119 "device_conv3d_bwd_data_instance" 120 ]; 121 }; 122 pool = { 123 targets = [ ··· 139 "device_normalization_bwd_gamma_beta_instance" 140 "device_normalization_fwd_instance" 141 ]; 142 }; 143 other2 = { 144 targets = [ ··· 150 "device_softmax_instance" 151 "device_transpose_instance" 152 ]; 153 }; 154 }; 155 tensorOpBuilder =
··· 36 "device_grouped_conv2d_fwd_instance" 37 "device_grouped_conv2d_fwd_dynamic_op_instance" 38 ]; 39 + requiredSystemFeatures = [ "big-parallel" ]; 40 }; 41 grouped_conv_bwd_3d = { 42 targets = [ ··· 47 "device_grouped_conv3d_bwd_weight_bilinear_instance" 48 "device_grouped_conv3d_bwd_weight_scale_instance" 49 ]; 50 + requiredSystemFeatures = [ "big-parallel" ]; 51 }; 52 grouped_conv_fwd_3d = { 53 targets = [ ··· 62 "device_grouped_conv3d_fwd_scaleadd_ab_instance" 63 "device_grouped_conv3d_fwd_scaleadd_scaleadd_relu_instance" 64 ]; 65 + requiredSystemFeatures = [ "big-parallel" ]; 66 }; 67 batched_gemm = { 68 targets = [ ··· 80 "device_grouped_gemm_fixed_nk_multi_abd_instance" 81 "device_grouped_gemm_tile_loop_instance" 82 ]; 83 + requiredSystemFeatures = [ "big-parallel" ]; 84 }; 85 gemm_universal = { 86 targets = [ ··· 112 "device_gemm_splitk_instance" 113 "device_gemm_streamk_instance" 114 ]; 115 + requiredSystemFeatures = [ "big-parallel" ]; 116 }; 117 conv = { 118 targets = [ ··· 123 "device_conv2d_fwd_bias_relu_add_instance" 124 "device_conv3d_bwd_data_instance" 125 ]; 126 + requiredSystemFeatures = [ "big-parallel" ]; 127 }; 128 pool = { 129 targets = [ ··· 145 "device_normalization_bwd_gamma_beta_instance" 146 "device_normalization_fwd_instance" 147 ]; 148 + requiredSystemFeatures = [ "big-parallel" ]; 149 }; 150 other2 = { 151 targets = [ ··· 157 "device_softmax_instance" 158 "device_transpose_instance" 159 ]; 160 + requiredSystemFeatures = [ "big-parallel" ]; 161 }; 162 }; 163 tensorOpBuilder =