pkgs/development/cuda-modules/packages/nccl.nix at python-updates

tjh.dev / nixpkgs
fork atom
nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
fork atom
nixpkgs / pkgs / development / cuda-modules / packages / nccl.nix
at python-updates 169 lines 4.6 kB view raw
wrap content
  1{
  2  _cuda,
  3  backendStdenv,
  4  cuda_cccl,
  5  cuda_cudart,
  6  cuda_nvcc,
  7  cudaAtLeast,
  8  cudaNamePrefix,
  9  fetchFromGitHub,
 10  flags,
 11  lib,
 12  python3,
 13  removeReferencesTo,
 14  which,
 15  # passthru.updateScript
 16  gitUpdater,
 17}:
 18let
 19  inherit (_cuda.lib) _mkMetaBadPlatforms;
 20  inherit (backendStdenv) hasJetsonCudaCapability requestedJetsonCudaCapabilities;
 21  inherit (lib)
 22    all
 23    flip
 24    getAttr
 25    getBin
 26    getInclude
 27    getLib
 28    licenses
 29    maintainers
 30    optionalString
 31    teams
 32    versionAtLeast
 33    versionOlder
 34    ;
 35in
 36backendStdenv.mkDerivation (finalAttrs: {
 37  __structuredAttrs = true;
 38  strictDeps = true;
 39
 40  # NOTE: Depends on the CUDA package set, so use cudaNamePrefix.
 41  name = "${cudaNamePrefix}-${finalAttrs.pname}-${finalAttrs.version}";
 42  pname = "nccl";
 43
 44  # NOTE:
 45  #   Compilation errors resulting from newer versions of NCCL on older releases of CUDA seem to be caused (mostly)
 46  #   by differences in assumed version of CCCL: using a newer CCCL with an older release of CUDA can (sometimes) allow
 47  #   newer versions of NCCL than what we provide here.
 48  version =
 49    if cudaAtLeast "11.7" then
 50      "2.28.7-1"
 51    else if cudaAtLeast "11.6" then
 52      "2.26.6-1"
 53    else
 54      "2.25.1-1";
 55
 56  src = fetchFromGitHub {
 57    owner = "NVIDIA";
 58    repo = "nccl";
 59    tag = "v${finalAttrs.version}";
 60    hash = getAttr finalAttrs.version {
 61      "2.28.7-1" = "sha256-NM19OiBBGmv3cGoVoRLKSh9Y59hiDoei9NIrRnTqWeA=";
 62      "2.26.6-1" = "sha256-vkWMGXCy+dIpYCecdafmOAGlnfRxIQ5Y2ZQuMjinraI=";
 63      "2.25.1-1" = "sha256-3snh0xdL9I5BYqdbqdl+noizJoI38mZRVOJChgEE1I8=";
 64    };
 65  };
 66
 67  outputs = [
 68    "out"
 69    "dev"
 70    "static"
 71  ];
 72
 73  nativeBuildInputs = [
 74    cuda_nvcc
 75    python3
 76    removeReferencesTo
 77    which
 78  ];
 79
 80  buildInputs = [
 81    (getInclude cuda_nvcc)
 82    cuda_cccl
 83    cuda_cudart
 84  ];
 85
 86  env.NIX_CFLAGS_COMPILE = toString [ "-Wno-unused-function" ];
 87
 88  postPatch = ''
 89    patchShebangs ./src/device/generate.py
 90    patchShebangs ./src/device/symmetric/generate.py
 91
 92    nixLog "patching $PWD/makefiles/common.mk to remove NVIDIA's ccbin declaration"
 93    substituteInPlace ./makefiles/common.mk \
 94      --replace-fail \
 95        '-ccbin $(CXX)' \
 96        ""
 97  ''
 98  # 2.27.3-1 was the first to introuce CXXSTD
 99  + optionalString (versionOlder finalAttrs.version "2.27.3-1") ''
100    nixLog "patching $PWD/makefiles/common.mk to remove NVIDIA's std hardcoding"
101    substituteInPlace ./makefiles/common.mk \
102      --replace-fail \
103        '-std=c++11' \
104        '$(CXXSTD)'
105  '';
106
107  # TODO: This would likely break under cross; need to delineate between build and host packages.
108  makeFlags = [
109    "CXXSTD=-std=c++17"
110    "CUDA_HOME=${getBin cuda_nvcc}"
111    "CUDA_INC=${getInclude cuda_cudart}/include"
112    "CUDA_LIB=${getLib cuda_cudart}/lib"
113    "NVCC_GENCODE=${flags.gencodeString}"
114    "PREFIX=$(out)"
115  ];
116
117  enableParallelBuilding = true;
118
119  postFixup = ''
120    _overrideFirst outputStatic "static" "lib" "out"
121    moveToOutput lib/libnccl_static.a "''${!outputStatic:?}"
122  ''
123  # Since CUDA 12.8, the cuda_nvcc path leaks in:
124  # - libnccl.so's .nv_fatbin section
125  # - libnccl_static.a
126  # &devrt -L /nix/store/00000000000000000000000000000000-...nvcc-.../bin/...
127  # This string makes cuda_nvcc a runtime dependency of nccl.
128  # See https://github.com/NixOS/nixpkgs/pull/457803
129  + ''
130    remove-references-to -t "${lib.getBin cuda_nvcc}" \
131      ''${!outputLib}/lib/libnccl.so.* \
132      ''${!outputStatic}/lib/*.a
133  '';
134
135  # C.f. remove-references-to above. Ensure *all* references to cuda_nvcc are removed
136  disallowedRequisites = [ (lib.getBin cuda_nvcc) ];
137
138  passthru = {
139    platformAssertions = [
140      {
141        message = "Pre-Thor Jetson devices (CUDA capabilities < 10.1) are not supported by NCCL";
142        assertion =
143          !hasJetsonCudaCapability || all (flip versionAtLeast "10.1") requestedJetsonCudaCapabilities;
144      }
145    ];
146
147    updateScript = gitUpdater {
148      inherit (finalAttrs) pname version;
149      rev-prefix = "v";
150    };
151  };
152
153  meta = {
154    description = "Multi-GPU and multi-node collective communication primitives for NVIDIA GPUs";
155    homepage = "https://developer.nvidia.com/nccl";
156    license = licenses.bsd3;
157    platforms = [
158      "aarch64-linux"
159      "x86_64-linux"
160    ];
161    # NCCL is not supported on Pre-Thor Jetsons, because it does not use NVLink or PCI-e for inter-GPU communication.
162    # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
163    badPlatforms = _mkMetaBadPlatforms finalAttrs;
164    maintainers = with maintainers; [
165      mdaiter
166    ];
167    teams = [ teams.cuda ];
168  };
169})