1{ lib
2, gcc11Stdenv
3, fetchFromGitHub
4, autoAddDriverRunpath
5, catch2
6, cmake
7, cudaPackages_10_2
8, cudaPackages_11_8
9, cudaPackages_12
10, fmt_9
11, git
12, jsoncpp
13, libevent
14, plog
15, python3
16, symlinkJoin
17, tclap_1_4
18, yaml-cpp
19}:
20let
# libevent variants used by DCGM: first with SSL support switched off, then
# additionally configured with shared libraries disabled and PIC enabled.
# Flags copied from DCGM's libevent build script.
libevent-nossl = libevent.override { sslSupport = false; };
libevent-nossl-static = libevent-nossl.overrideAttrs (prevAttrs:
  let
    # One flag string shared by the C and C++ compilers.
    compilerFlags = "-Wno-cast-function-type -Wno-implicit-fallthrough -fPIC";
  in
  {
    CFLAGS = compilerFlags;
    CXXFLAGS = compilerFlags;
    # `or [ ]` keeps this override evaluating even if the base derivation
    # does not define configureFlags.
    configureFlags = (prevAttrs.configureFlags or [ ]) ++ [
      "--disable-shared"
      "--with-pic"
    ];
  });
28
# jsoncpp with its `enableStatic` override option switched on.
jsoncpp-static = jsoncpp.override {
  enableStatic = true;
};
30
# DCGM depends on 3 different versions of CUDA at the same time.
# The runtime closure, thankfully, is quite small because most things
# are statically linked.
# Each entry pairs a CUDA major version string with the packages used for it.
cudaPackageSetByVersion =
  let
    # Nixpkgs cudaPackages_10 doesn't have redist packages broken out, so the
    # toolkit and its `lib` attribute are listed directly.
    inherit (cudaPackages_10_2) cudatoolkit;
  in
  [
    { version = "10"; pkgSet = [ cudatoolkit cudatoolkit.lib ]; }
    { version = "11"; pkgSet = getCudaPackages cudaPackages_11_8; }
    { version = "12"; pkgSet = getCudaPackages cudaPackages_12; }
  ];
52
# Given a cudaPackages set, return the list of redist packages DCGM needs.
# Cf. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
getCudaPackages = cudaPkgs:
  map (name: cudaPkgs.${name}) [
    "cuda_cccl"
    "cuda_cudart"
    "cuda_nvcc"
    "cuda_nvml_dev"
    "libcublas"
    "libcufft"
    "libcurand"
  ];
64
# Builds CMake code to add CUDA paths for include and lib.
# Takes one { version, pkgSet } record and returns a string holding two
# `list(APPEND ...)` statements for that CUDA version.
mkAppendCudaPaths = { version, pkgSet }:
  let
    # The DCGM CMake assumes that the folder containing cuda.h contains all
    # headers, so everything is merged into a single tree for the include path.
    # Joining *just* the include subdirectories of each package would be more
    # convenient, but not all of them have an include directory and making that
    # work is more effort than it's worth for this temporary, build-time package.
    mergedTree = symlinkJoin {
      name = "cuda-combined-${version}";
      paths = pkgSet;
    };

    # The merged tree breaks the build for some reason when used for libraries,
    # so each package's own lib directory is passed instead, double-quoted and
    # separated by single spaces.
    quoteLibDir = pkg: "\"${pkg}/lib\"";
    libDirs = lib.concatStringsSep " " (map quoteLibDir pkgSet);
  in
  ''
    list(APPEND Cuda${version}_INCLUDE_PATHS "${mergedTree}/include")
    list(APPEND Cuda${version}_LIB_PATHS ${libDirs})
  '';
84
# DCGM's very particular build system requires gcc11.
# Cf. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/build.sh#L22
in gcc11Stdenv.mkDerivation rec {
  pname = "dcgm";
  # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.
  version = "3.2.5";

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "DCGM";
    rev = "refs/tags/v${version}";
    hash = "sha256-iMyYOr3dSpdRV2S/TlB/tEOAWYhK09373ZRbd5vzogQ=";
  };

  strictDeps = true;
  hardeningDisable = [ "all" ];

  nativeBuildInputs = [
    # autoAddDriverRunpath does not actually depend on or incur any dependency
    # of cudaPackages. It merely adds an impure, non-Nix PATH to the RPATHs of
    # executables that need to use cuda at runtime.
    autoAddDriverRunpath

    cmake
    git
    python3
  ];

  buildInputs = [
    # Header-only libraries
    plog.dev
    tclap_1_4

    catch2
    fmt_9
    jsoncpp-static
    libevent-nossl-static
    yaml-cpp
  ];

  # CMake snippets (one per CUDA version, newline-separated) that register our
  # CUDA paths; prePatch prepends them to FindCuda.cmake so it can find them.
  EXTRA_CUDA_PATHS = lib.concatStringsSep "\n" (map mkAppendCudaPaths cudaPackageSetByVersion);
  prePatch = ''
    echo "$EXTRA_CUDA_PATHS"$'\n'"$(cat cmake/FindCuda.cmake)" > cmake/FindCuda.cmake
  '';

  # Every package listed in any CUDA package set is a disallowed reference of
  # the build result.
  disallowedReferences = lib.concatMap (entry: entry.pkgSet) cudaPackageSetByVersion;

  meta = {
    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
    homepage = "https://developer.nvidia.com/dcgm";
    license = lib.licenses.asl20;
    maintainers = lib.teams.deshaw.members;
    mainProgram = "dcgmi";
    platforms = lib.platforms.linux;
  };
}