pkgs/by-name/dc/dcgm/package.nix at python-updates

tjh.dev / nixpkgs
fork atom
nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
fork atom
nixpkgs / pkgs / by-name / dc / dcgm / package.nix
at python-updates 180 lines 4.7 kB view raw
wrap content
  1{
  2  lib,
  3  stdenv,
  4  fetchFromGitHub,
  5  autoAddDriverRunpath,
  6  catch2_3,
  7  cmake,
  8  ctestCheckHook,
  9  coreutils,
 10  mpi,
 11  mpiCheckPhaseHook,
 12  ninja,
 13  cudaPackages_12,
 14  boost186,
 15  fmt_10,
 16  git,
 17  jsoncpp,
 18  libevent,
 19  lshw,
 20  plog,
 21  python3,
 22  replaceVars,
 23  symlinkJoin,
 24  tclap_1_4,
 25  util-linux,
 26  yaml-cpp,
 27}:
 28let
 29  # DCGM can depend on multiple versions of CUDA at the same time.
 30  # The runtime closure, thankfully, is quite small as it does not
 31  # include the CUDA libraries.
 32  cudaPackageSets = [
 33    cudaPackages_12
 34  ];
 35
 36  # Select needed redist packages from cudaPackages
 37  # C.f. https://github.com/NVIDIA/DCGM/blob/7e1012302679e4bb7496483b32dcffb56e528c92/dcgmbuild/scripts/0080_cuda.sh#L24-L39
 38  getCudaPackages =
 39    p: with p; [
 40      cuda_cccl
 41      cuda_cudart
 42      cuda_nvcc
 43      cuda_nvml_dev
 44      libcublas
 45      libcufft
 46      libcurand
 47    ];
 48
 49  # Builds CMake flags to add CUDA paths for include and lib.
 50  mkCudaFlags =
 51    cudaPackages:
 52    let
 53      version = cudaPackages.cudaMajorVersion;
 54      # The DCGM CMake assumes that the folder containing cuda.h contains all headers, so we must
 55      # combine everything together for headers to work.
 56      headers = symlinkJoin {
 57        name = "cuda-headers-combined-${version}";
 58        paths = lib.map (pkg: "${lib.getInclude pkg}/include") (getCudaPackages cudaPackages);
 59      };
 60    in
 61    [
 62      (lib.cmakeFeature "CUDA${version}_INCLUDE_DIR" "${headers}")
 63      (lib.cmakeFeature "CUDA${version}_LIBS" "${lib.getOutput "stubs" cudaPackages.cuda_cudart}/lib/stubs/libcuda.so")
 64      (lib.cmakeFeature "CUDA${version}_STATIC_LIBS" "${lib.getLib cudaPackages.cuda_cudart}/lib/libcudart.so")
 65      (lib.cmakeFeature "CUDA${version}_STATIC_CUBLAS_LIBS" (
 66        lib.concatStringsSep ";" [
 67          "${lib.getLib cudaPackages.libcublas}/lib/libcublas.so"
 68          "${lib.getLib cudaPackages.libcublas}/lib/libcublasLt.so"
 69        ]
 70      ))
 71    ];
 72in
 73stdenv.mkDerivation {
 74  pname = "dcgm";
 75  version = "4.3.1"; # N.B: If you change this, be sure prometheus-dcgm-exporter supports this version.
 76
 77  src = fetchFromGitHub {
 78    owner = "NVIDIA";
 79    repo = "DCGM";
 80    # No tag for 4.3.1 yet.
 81    #tag = "v${version}";
 82    rev = "1477d8785e899ab3450fdff2b486102e9bed096b";
 83    hash = "sha256-FebqG28aodENGLNBBbiGpckzzeuP+y44dCALtYnN1yU=";
 84  };
 85
 86  patches = [
 87    ./remove-cuda-11.patch
 88    ./dynamic-libs.patch
 89    (replaceVars ./fix-paths.patch {
 90      inherit coreutils;
 91      inherit util-linux;
 92      inherit lshw;
 93      inherit mpi;
 94      inherit (stdenv) shell;
 95      dcgm_out = null;
 96    })
 97  ];
 98
 99  hardeningDisable = [ "all" ];
100
101  strictDeps = true;
102
103  nativeBuildInputs = [
104    # autoAddDriverRunpath does not actually depend on or incur any dependency
105    # of cudaPackages. It merely adds an impure, non-Nix PATH to the RPATHs of
106    # executables that need to use cuda at runtime.
107    autoAddDriverRunpath
108
109    cmake
110    ninja
111    git
112    python3
113  ];
114
115  buildInputs = [
116    # Header-only
117    boost186
118    catch2_3
119    plog.dev
120    tclap_1_4
121
122    fmt_10
123    yaml-cpp
124    jsoncpp
125    libevent
126  ];
127
128  nativeCheckInputs = [
129    mpi
130    ctestCheckHook
131    mpiCheckPhaseHook
132  ];
133
134  disabledTests = [
135    # Fail due to lack of `/sys` in the sandbox.
136    "DcgmModuleSysmon::PauseResume Module resumed after initialization"
137    "DcgmModuleSysmon PauseResume Module rejects invalid messages"
138    "DcgmModuleSysmon PauseResume Module accepts valid messages"
139    "DcgmModuleSysmon Watches"
140    "DcgmModuleSysmon maxSampleAge"
141    "DcgmModuleSysmon::CalculateCoreUtilization"
142    "DcgmModuleSysmon::ParseProcStatCpuLine"
143    "DcgmModuleSysmon::ParseThermalFileContentsAndStore"
144    "DcgmModuleSysmon::PopulateTemperatureFileMap"
145    "DcgmModuleSysmon::ReadCoreSpeed"
146    "DcgmModuleSysmon::ReadTemperature"
147    "Sysmon: initialize module"
148  ];
149
150  # Add our paths to the CMake flags so FindCuda.cmake can find them.
151  cmakeFlags = lib.concatMap mkCudaFlags cudaPackageSets;
152
153  # Lots of dodgy C++.
154  env.NIX_CFLAGS_COMPILE = "-Wno-error";
155
156  doCheck = true;
157  dontUseNinjaCheck = true;
158
159  postPatch = ''
160    while read -r -d "" file; do
161      substituteInPlace "$file" --replace-quiet @dcgm_out@ "$out"
162    done < <(find . '(' -name '*.h' -or -name '*.cpp' ')' -print0)
163  '';
164
165  disallowedReferences = lib.concatMap getCudaPackages cudaPackageSets;
166
167  __structuredAttrs = true;
168
169  meta = {
170    description = "Data Center GPU Manager (DCGM) is a daemon that allows users to monitor NVIDIA data-center GPUs";
171    homepage = "https://developer.nvidia.com/dcgm";
172    license = lib.licenses.asl20;
173    maintainers = with lib.maintainers; [
174      de11n
175      despsyched
176    ];
177    mainProgram = "dcgmi";
178    platforms = lib.platforms.linux;
179  };
180}