nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
at python-updates 180 lines 4.7 kB view raw
# Package expression for NVIDIA DCGM (Data Center GPU Manager).
#
# The expression is a function over its nixpkgs dependencies and evaluates to a
# single `stdenv.mkDerivation` call. CUDA is consumed only through CMake cache
# entries computed below; the CUDA redist packages are listed in
# `disallowedReferences`, so the build fails if the output references them.
{
  lib,
  stdenv,
  fetchFromGitHub,
  autoAddDriverRunpath,
  catch2_3,
  cmake,
  ctestCheckHook,
  coreutils,
  mpi,
  mpiCheckPhaseHook,
  ninja,
  cudaPackages_12,
  boost186,
  fmt_10,
  git,
  jsoncpp,
  libevent,
  lshw,
  plog,
  python3,
  replaceVars,
  symlinkJoin,
  tclap_1_4,
  util-linux,
  yaml-cpp,
}:
let
  # CUDA package sets to build against. DCGM is able to depend on several CUDA
  # versions at once; the runtime closure stays small because the CUDA
  # libraries themselves are not part of it.
  cudaSets = [ cudaPackages_12 ];

  # The redist packages needed out of one CUDA package set.
  # C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
  redistFor = set: [
    set.cuda_cccl
    set.cuda_cudart
    set.cuda_nvcc
    set.cuda_nvml_dev
    set.libcublas
    set.libcufft
    set.libcurand
  ];

  # CMake cache entries pointing DCGM at the headers and libraries of one CUDA
  # package set. Entry names are prefixed with the set's major version.
  cudaCmakeFlagsFor =
    set:
    let
      major = set.cudaMajorVersion;
      prefix = "CUDA${major}";

      cudartStubs = lib.getOutput "stubs" set.cuda_cudart;
      cudartLib = lib.getLib set.cuda_cudart;
      cublasLib = lib.getLib set.libcublas;

      # DCGM's CMake expects every CUDA header to live in the directory that
      # holds cuda.h, so all include directories are merged into one tree.
      mergedHeaders = symlinkJoin {
        name = "cuda-headers-combined-${major}";
        paths = lib.map (pkg: "${lib.getInclude pkg}/include") (redistFor set);
      };

      cublasLibs = lib.concatStringsSep ";" [
        "${cublasLib}/lib/libcublas.so"
        "${cublasLib}/lib/libcublasLt.so"
      ];
    in
    [
      (lib.cmakeFeature "${prefix}_INCLUDE_DIR" "${mergedHeaders}")
      (lib.cmakeFeature "${prefix}_LIBS" "${cudartStubs}/lib/stubs/libcuda.so")
      (lib.cmakeFeature "${prefix}_STATIC_LIBS" "${cudartLib}/lib/libcudart.so")
      (lib.cmakeFeature "${prefix}_STATIC_CUBLAS_LIBS" cublasLibs)
    ];
in
stdenv.mkDerivation {
  pname = "dcgm";
  # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.
  version = "4.3.1";

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "DCGM";
    # No tag for 4.3.1 yet, so a commit is pinned instead.
    #tag = "v${version}";
    rev = "1477d8785e899ab3450fdff2b486102e9bed096b";
    hash = "sha256-FebqG28aodENGLNBBbiGpckzzeuP+y44dCALtYnN1yU=";
  };

  patches = [
    ./remove-cuda-11.patch
    ./dynamic-libs.patch
    (replaceVars ./fix-paths.patch {
      inherit
        coreutils
        util-linux
        lshw
        mpi
        ;
      inherit (stdenv) shell;
      # Not substituted here; postPatch replaces @dcgm_out@ with "$out".
      dcgm_out = null;
    })
  ];

  # Fill in the @dcgm_out@ placeholder in every header and C++ source file.
  postPatch = ''
    while read -r -d "" file; do
      substituteInPlace "$file" --replace-quiet @dcgm_out@ "$out"
    done < <(find . '(' -name '*.h' -or -name '*.cpp' ')' -print0)
  '';

  strictDeps = true;
  __structuredAttrs = true;

  hardeningDisable = [ "all" ];

  # Lots of dodgy C++.
  env = {
    NIX_CFLAGS_COMPILE = "-Wno-error";
  };

  nativeBuildInputs = [
    # autoAddDriverRunpath neither depends on cudaPackages nor pulls any of it
    # in. It only adds an impure, non-Nix path to the RPATHs of executables
    # that need to use cuda at runtime.
    autoAddDriverRunpath

    cmake
    ninja
    git
    python3
  ];

  buildInputs = [
    # Header-only
    boost186
    catch2_3
    plog.dev
    tclap_1_4

    fmt_10
    yaml-cpp
    jsoncpp
    libevent
  ];

  # Hand the CUDA locations to CMake so FindCuda.cmake can find them.
  cmakeFlags = lib.concatMap cudaCmakeFlagsFor cudaSets;

  doCheck = true;
  dontUseNinjaCheck = true;

  nativeCheckInputs = [
    mpi
    ctestCheckHook
    mpiCheckPhaseHook
  ];

  disabledTests = [
    # Fail due to lack of `/sys` in the sandbox.
    "DcgmModuleSysmon::PauseResume Module resumed after initialization"
    "DcgmModuleSysmon PauseResume Module rejects invalid messages"
    "DcgmModuleSysmon PauseResume Module accepts valid messages"
    "DcgmModuleSysmon Watches"
    "DcgmModuleSysmon maxSampleAge"
    "DcgmModuleSysmon::CalculateCoreUtilization"
    "DcgmModuleSysmon::ParseProcStatCpuLine"
    "DcgmModuleSysmon::ParseThermalFileContentsAndStore"
    "DcgmModuleSysmon::PopulateTemperatureFileMap"
    "DcgmModuleSysmon::ReadCoreSpeed"
    "DcgmModuleSysmon::ReadTemperature"
    "Sysmon: initialize module"
  ];

  # The output must not reference any of the CUDA redist packages used above.
  disallowedReferences = lib.concatMap redistFor cudaSets;

  meta = {
    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
    homepage = "https://developer.nvidia.com/dcgm";
    license = lib.licenses.asl20;
    maintainers = [
      lib.maintainers.de11n
      lib.maintainers.despsyched
    ];
    mainProgram = "dcgmi";
    platforms = lib.platforms.linux;
  };
}