# nixpkgs mirror (for testing)
# github.com/NixOS/nixpkgs
# nix
# Nix package expression for NVIDIA DCGM (Data Center GPU Manager).
#
# The function takes its build/runtime dependencies as arguments and returns a
# `stdenv.mkDerivation` that builds DCGM with CMake + Ninja against every CUDA
# package set listed in `cudaPackageSets`.
{
  lib,
  stdenv,
  fetchFromGitHub,
  autoAddDriverRunpath,
  catch2_3,
  cmake,
  ctestCheckHook,
  coreutils,
  mpi,
  mpiCheckPhaseHook,
  ninja,
  cudaPackages_12,
  boost186,
  fmt_10,
  git,
  jsoncpp,
  libevent,
  lshw,
  plog,
  python3,
  replaceVars,
  symlinkJoin,
  tclap_1_4,
  util-linux,
  yaml-cpp,
}:
let
  # DCGM can depend on multiple versions of CUDA at the same time.
  # The runtime closure, thankfully, is quite small as it does not
  # include the CUDA libraries.
  cudaPackageSets = [
    cudaPackages_12
  ];

  # Select needed redist packages from cudaPackages
  # C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
  #
  # Argument `p` is one CUDA package set; the result is the list of packages
  # from that set used both for header collection (mkCudaFlags) and for
  # `disallowedReferences` below.
  getCudaPackages =
    p: with p; [
      cuda_cccl
      cuda_cudart
      cuda_nvcc
      cuda_nvml_dev
      libcublas
      libcufft
      libcurand
    ];

  # Builds CMake flags to add CUDA paths for include and lib.
  #
  # Argument `cudaPackages` is one CUDA package set. The result is a list of
  # CMake flags whose names embed that set's major version
  # (e.g. CUDA<major>_INCLUDE_DIR), so flags for several sets can coexist.
  mkCudaFlags =
    cudaPackages:
    let
      version = cudaPackages.cudaMajorVersion;
      # The DCGM CMake assumes that the folder containing cuda.h contains all headers, so we must
      # combine everything together for headers to work.
      headers = symlinkJoin {
        name = "cuda-headers-combined-${version}";
        paths = lib.map (pkg: "${lib.getInclude pkg}/include") (getCudaPackages cudaPackages);
      };
    in
    [
      (lib.cmakeFeature "CUDA${version}_INCLUDE_DIR" "${headers}")
      # Link against the driver stub shipped in cuda_cudart's "stubs" output.
      (lib.cmakeFeature "CUDA${version}_LIBS" "${lib.getOutput "stubs" cudaPackages.cuda_cudart}/lib/stubs/libcuda.so")
      # NOTE(review): despite the *_STATIC_* variable names, shared objects are
      # passed here; presumably ./dynamic-libs.patch makes the build accept
      # them — confirm against the patch.
      (lib.cmakeFeature "CUDA${version}_STATIC_LIBS" "${lib.getLib cudaPackages.cuda_cudart}/lib/libcudart.so")
      # Multiple libraries are joined with ";" to form a single CMake list value.
      (lib.cmakeFeature "CUDA${version}_STATIC_CUBLAS_LIBS" (
        lib.concatStringsSep ";" [
          "${lib.getLib cudaPackages.libcublas}/lib/libcublas.so"
          "${lib.getLib cudaPackages.libcublas}/lib/libcublasLt.so"
        ]
      ))
    ];
in
stdenv.mkDerivation {
  pname = "dcgm";
  version = "4.3.1"; # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "DCGM";
    # No tag for 4.3.1 yet.
    #tag = "v${version}";
    rev = "1477d8785e899ab3450fdff2b486102e9bed096b";
    hash = "sha256-FebqG28aodENGLNBBbiGpckzzeuP+y44dCALtYnN1yU=";
  };

  patches = [
    ./remove-cuda-11.patch
    ./dynamic-libs.patch
    (replaceVars ./fix-paths.patch {
      inherit coreutils;
      inherit util-linux;
      inherit lshw;
      inherit mpi;
      inherit (stdenv) shell;
      # Deliberately left unsubstituted here: the @dcgm_out@ placeholder is
      # filled in with "$out" by postPatch below.
      dcgm_out = null;
    })
  ];

  hardeningDisable = [ "all" ];

  strictDeps = true;

  nativeBuildInputs = [
    # autoAddDriverRunpath does not actually depend on or incur any dependency
    # of cudaPackages. It merely adds an impure, non-Nix PATH to the RPATHs of
    # executables that need to use cuda at runtime.
    autoAddDriverRunpath

    cmake
    ninja
    git
    python3
  ];

  buildInputs = [
    # Header-only
    boost186
    catch2_3
    plog.dev
    tclap_1_4

    fmt_10
    yaml-cpp
    jsoncpp
    libevent
  ];

  nativeCheckInputs = [
    mpi
    ctestCheckHook
    mpiCheckPhaseHook
  ];

  # Tests skipped during the check phase (presumably consumed by
  # ctestCheckHook — confirm against the hook's documentation).
  disabledTests = [
    # Fail due to lack of `/sys` in the sandbox.
    "DcgmModuleSysmon::PauseResume Module resumed after initialization"
    "DcgmModuleSysmon PauseResume Module rejects invalid messages"
    "DcgmModuleSysmon PauseResume Module accepts valid messages"
    "DcgmModuleSysmon Watches"
    "DcgmModuleSysmon maxSampleAge"
    "DcgmModuleSysmon::CalculateCoreUtilization"
    "DcgmModuleSysmon::ParseProcStatCpuLine"
    "DcgmModuleSysmon::ParseThermalFileContentsAndStore"
    "DcgmModuleSysmon::PopulateTemperatureFileMap"
    "DcgmModuleSysmon::ReadCoreSpeed"
    "DcgmModuleSysmon::ReadTemperature"
    "Sysmon: initialize module"
  ];

  # Add our paths to the CMake flags so FindCuda.cmake can find them.
  cmakeFlags = lib.concatMap mkCudaFlags cudaPackageSets;

  # Lots of dodgy C++.
  env.NIX_CFLAGS_COMPILE = "-Wno-error";

  doCheck = true;
  # Turn off the Ninja-driven check; ctestCheckHook is listed in
  # nativeCheckInputs and is presumably what runs the tests instead.
  dontUseNinjaCheck = true;

  # Replace every @dcgm_out@ placeholder in the *.h / *.cpp sources with the
  # derivation's output path. --replace-quiet keeps substituteInPlace silent
  # for the files that do not contain the placeholder.
  postPatch = ''
    while read -r -d "" file; do
      substituteInPlace "$file" --replace-quiet @dcgm_out@ "$out"
    done < <(find . '(' -name '*.h' -or -name '*.cpp' ')' -print0)
  '';

  # Fail the build if the output retains a reference to any of the selected
  # CUDA packages, keeping them out of the runtime closure.
  disallowedReferences = lib.concatMap getCudaPackages cudaPackageSets;

  __structuredAttrs = true;

  meta = {
    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
    homepage = "https://developer.nvidia.com/dcgm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      de11n
      despsyched
    ];
    mainProgram = "dcgmi";
    platforms = lib.platforms.linux;
  };
}