Merge pull request #274319 from ConnorBaker/feat/cudaPackages-all-packages-eval

tree-wide: cudaPackages attributes should not cause default eval to fail

authored by Someone and committed by GitHub e529aea8 6f2d8066

Changed files
+131 -106
pkgs
applications
science
math
caffe
development
cuda-modules
libraries
science
math
magma
xgboost
python-modules
jaxlib
torch
top-level
+1 -1
pkgs/applications/science/math/caffe/default.nix
··· 153 153 || cudaSupport 154 154 || !(leveldbSupport -> (leveldb != null && snappy != null)) 155 155 || !(cudnnSupport -> (hasCudnn && cudaSupport)) 156 - || !(ncclSupport -> cudaSupport) 156 + || !(ncclSupport -> (cudaSupport && !nccl.meta.unsupported)) 157 157 || !(pythonSupport -> (python != null && numpy != null)) 158 158 ; 159 159 license = licenses.bsd2;
+1 -1
pkgs/development/cuda-modules/cuda/overrides.nix
··· 72 72 env.autoPatchelfIgnoreMissingDeps = 73 73 prevAttrs.env.autoPatchelfIgnoreMissingDeps + " libnvrm_gpu.so libnvrm_mem.so libnvdla_runtime.so"; 74 74 # `cuda_compat` only works on aarch64-linux, and only when building for Jetson devices. 75 - brokenConditions = prevAttrs.brokenConditions // { 75 + badPlatformsConditions = prevAttrs.badPlatformsConditions // { 76 76 "Trying to use cuda_compat on aarch64-linux targeting non-Jetson devices" = 77 77 !final.flags.isJetsonBuild; 78 78 };
+13 -5
pkgs/development/cuda-modules/cudnn/shims.nix
··· 1 1 # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix 2 - {package, redistArch}: 3 2 { 4 - featureRelease.${redistArch}.outputs = { 5 - lib = true; 6 - static = true; 7 - dev = true; 3 + lib, 4 + package, 5 + # redistArch :: String 6 + # String is "unsupported" if the given architecture is unsupported. 7 + redistArch, 8 + }: 9 + { 10 + featureRelease = lib.optionalAttrs (redistArch != "unsupported") { 11 + ${redistArch}.outputs = { 12 + lib = true; 13 + static = true; 14 + dev = true; 15 + }; 8 16 }; 9 17 redistribRelease = { 10 18 name = "NVIDIA CUDA Deep Neural Network library (cuDNN)";
+1
pkgs/development/cuda-modules/cutensor/extension.nix
··· 92 92 # A release is supported if it has a libPath that matches our CUDA version for our platform. 93 93 # LibPath are not constant across the same release -- one platform may support fewer 94 94 # CUDA versions than another. 95 + # redistArch :: String 95 96 redistArch = flags.getRedistArch hostPlatform.system; 96 97 # platformIsSupported :: Manifests -> Boolean 97 98 platformIsSupported =
+20 -30
pkgs/development/cuda-modules/flags.nix
··· 131 131 # `linux-aarch64` redist (which is for Jetson devices) if we're building any Jetson devices. 132 132 # Since both are based on aarch64, we can only have one or the other, otherwise there's an 133 133 # ambiguity as to which should be used. 134 + # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of 135 + # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported 136 + # systems gracefully. 134 137 # getRedistArch :: String -> String 135 - getRedistArch = 136 - nixSystem: 137 - if nixSystem == "aarch64-linux" then 138 - if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa" 139 - else if nixSystem == "x86_64-linux" then 140 - "linux-x86_64" 141 - else if nixSystem == "ppc64le-linux" then 142 - "linux-ppc64le" 143 - else if nixSystem == "x86_64-windows" then 144 - "windows-x86_64" 145 - else 146 - "unsupported"; 138 + getRedistArch = nixSystem: attrsets.attrByPath [ nixSystem ] "unsupported" { 139 + aarch64-linux = if jetsonTargets != [] then "linux-aarch64" else "linux-sbsa"; 140 + x86_64-linux = "linux-x86_64"; 141 + ppc64le-linux = "linux-ppc64le"; 142 + x86_64-windows = "windows-x86_64"; 143 + }; 147 144 148 145 # Maps NVIDIA redist arch to Nix system. 149 - # It is imperative that we include the boolean condition based on jetsonTargets to ensure 150 - # we don't advertise availability of packages only available on server-grade ARM 151 - # as being available for the Jetson, since both `linux-sbsa` and `linux-aarch64` are 152 - # mapped to the Nix system `aarch64-linux`. 
153 - getNixSystem = 154 - redistArch: 155 - if redistArch == "linux-sbsa" && jetsonTargets == [] then 156 - "aarch64-linux" 157 - else if redistArch == "linux-aarch64" && jetsonTargets != [] then 158 - "aarch64-linux" 159 - else if redistArch == "linux-x86_64" then 160 - "x86_64-linux" 161 - else if redistArch == "linux-ppc64le" then 162 - "ppc64le-linux" 163 - else if redistArch == "windows-x86_64" then 164 - "x86_64-windows" 165 - else 166 - "unsupported-${redistArch}"; 146 + # NOTE: This function *will* be called by unsupported systems because `cudaPackages` is part of 147 + # `all-packages.nix`, which is evaluated on all systems. As such, we need to handle unsupported 148 + # systems gracefully. 149 + # getNixSystem :: String -> String 150 + getNixSystem = redistArch: attrsets.attrByPath [ redistArch ] "unsupported-${redistArch}" { 151 + linux-sbsa = "aarch64-linux"; 152 + linux-aarch64 = "aarch64-linux"; 153 + linux-x86_64 = "x86_64-linux"; 154 + linux-ppc64le = "ppc64le-linux"; 155 + windows-x86_64 = "x86_64-windows"; 156 + }; 167 157 168 158 formatCapabilities = 169 159 {
+50 -25
pkgs/development/cuda-modules/generic-builders/manifest.nix
··· 43 43 # Get the redist architectures for which package provides distributables. 44 44 # These are used by meta.platforms. 45 45 supportedRedistArchs = builtins.attrNames featureRelease; 46 + # redistArch :: String 47 + # The redistArch is the name of the architecture for which the redistributable is built. 48 + # It is `"unsupported"` if the redistributable is not supported on the target platform. 46 49 redistArch = flags.getRedistArch hostPlatform.system; 47 50 in 48 51 backendStdenv.mkDerivation ( ··· 87 90 "sample" 88 91 "python" 89 92 ]; 93 + # Filter out outputs that don't exist in the redistributable. 94 + # NOTE: In the case the redistributable isn't supported on the target platform, 95 + # we will have `outputs = [ "out" ] ++ possibleOutputs`. This is of note because platforms which 96 + # aren't supported would otherwise have evaluation errors when trying to access outputs other than `out`. 97 + # The alternative would be to have `outputs = [ "out" ]` when `redistArch = "unsupported"`, but that would 98 + # require adding guards throughout the entirety of the CUDA package set to ensure `cudaSupport` is true -- 99 + # recall that OfBorg will evaluate packages marked as broken and that `cudaPackages` will be evaluated with 100 + # `cudaSupport = false`! 90 101 additionalOutputs = 91 - if redistArch == "unsupported" then possibleOutputs else builtins.filter hasOutput possibleOutputs; 102 + if redistArch == "unsupported" 103 + then possibleOutputs 104 + else builtins.filter hasOutput possibleOutputs; 92 105 # The out output is special -- it's the default output and we always include it. 93 106 outputs = [ "out" ] ++ additionalOutputs; 94 107 in ··· 112 125 python = ["**/*.whl"]; 113 126 }; 114 127 115 - # Useful for introspecting why something went wrong. 116 - # Maps descriptions of why the derivation would be marked broken to 117 - # booleans indicating whether that description is true. 
118 - brokenConditions = {}; 128 + # Useful for introspecting why something went wrong. Maps descriptions of why the derivation would be marked as 129 + # broken or have badPlatforms include the current platform to booleans. 119 130 120 - src = fetchurl { 121 - url = 122 - if (builtins.hasAttr redistArch redistribRelease) then 123 - "https://developer.download.nvidia.com/compute/${redistName}/redist/${ 124 - redistribRelease.${redistArch}.relative_path 125 - }" 126 - else 127 - "cannot-construct-an-url-for-the-${redistArch}-platform"; 128 - sha256 = redistribRelease.${redistArch}.sha256 or lib.fakeHash; 129 - }; 131 + # brokenConditions :: AttrSet Bool 132 + # Sets `meta.broken = true` if any of the conditions are true. 133 + # Example: Broken on a specific version of CUDA or when a dependency has a specific version. 134 + brokenConditions = { }; 135 + 136 + # badPlatformsConditions :: AttrSet Bool 137 + # Sets `meta.badPlatforms = meta.platforms` if any of the conditions are true. 138 + # Example: Broken on a specific architecture when some condition is met (like targeting Jetson). 139 + badPlatformsConditions = { }; 140 + 141 + # src :: Optional Derivation 142 + src = trivial.pipe redistArch [ 143 + # If redistArch doesn't exist in redistribRelease, return null. 144 + (redistArch: redistribRelease.${redistArch} or null) 145 + # If the release is non-null, fetch the source; otherwise, return null. 146 + (trivial.mapNullable ( 147 + { relative_path, sha256, ... }: 148 + fetchurl { 149 + url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}"; 150 + inherit sha256; 151 + } 152 + )) 153 + ]; 130 154 131 155 # Handle the pkg-config files: 132 156 # 1. No FHS ··· 297 321 meta = { 298 322 description = "${redistribRelease.name}. 
By downloading and using the packages you accept the terms and conditions of the ${finalAttrs.meta.license.shortName}"; 299 323 sourceProvenance = [sourceTypes.binaryNativeCode]; 300 - platforms = 301 - lists.concatMap 302 - ( 303 - redistArch: 304 - let 305 - nixSystem = flags.getNixSystem redistArch; 306 - in 307 - lists.optionals (!(strings.hasPrefix "unsupported-" nixSystem)) [ nixSystem ] 308 - ) 309 - supportedRedistArchs; 310 324 broken = lists.any trivial.id (attrsets.attrValues finalAttrs.brokenConditions); 325 + platforms = trivial.pipe supportedRedistArchs [ 326 + # Map each redist arch to the equivalent nix system, or to an "unsupported-"-prefixed string if there is no equivalent. 327 + (builtins.map flags.getNixSystem) 328 + # Filter out unsupported systems 329 + (builtins.filter (nixSystem: !(strings.hasPrefix "unsupported-" nixSystem))) 330 + ]; 331 + badPlatforms = 332 + let 333 + isBadPlatform = lists.any trivial.id (attrsets.attrValues finalAttrs.badPlatformsConditions); 334 + in 335 + lists.optionals isBadPlatform finalAttrs.meta.platforms; 311 336 license = licenses.unfree; 312 337 maintainers = teams.cuda.members; 313 338 # Force the use of the default, fat output by default (even though `dev` exists, which
+6 -12
pkgs/development/cuda-modules/generic-builders/multiplex.nix
··· 20 20 # The featureRelease is used to populate meta.platforms (by way of looking at the attribute names) 21 21 # and to determine the outputs of the package. 22 22 # shimsFn :: {package, redistArch} -> AttrSet 23 - shimsFn ? ({package, redistArch}: throw "shimsFn must be provided"), 23 + shimsFn ? (throw "shimsFn must be provided"), 24 24 # fixupFn :: Path 25 25 # A path (or nix expression) to be evaluated with callPackage and then 26 26 # provided to the package's overrideAttrs function. ··· 29 29 # - cudaVersion 30 30 # - mkVersionedPackageName 31 31 # - package 32 - fixupFn ? ( 33 - { 34 - final, 35 - cudaVersion, 36 - mkVersionedPackageName, 37 - package, 38 - ... 39 - }: 40 - throw "fixupFn must be provided" 41 - ), 32 + # - ... 33 + fixupFn ? (throw "fixupFn must be provided"), 42 34 }: 43 35 let 44 36 inherit (lib) ··· 80 72 && strings.versionAtLeast package.maxCudaVersion cudaVersion; 81 73 82 74 # Get all of the packages for our given platform. 75 + # redistArch :: String 76 + # Value is `"unsupported"` if the platform is not supported. 83 77 redistArch = flags.getRedistArch hostPlatform.system; 84 78 85 - allReleases = builtins.concatMap (xs: xs) (builtins.attrValues releaseSets); 79 + allReleases = lists.flatten (builtins.attrValues releaseSets); 86 80 87 81 # All the supported packages we can build for our platform. 88 82 # perSystemReleases :: List Package
+3
pkgs/development/cuda-modules/nccl/default.nix
··· 100 100 homepage = "https://developer.nvidia.com/nccl"; 101 101 license = licenses.bsd3; 102 102 platforms = platforms.linux; 103 + # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication. 104 + # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9 105 + badPlatforms = lib.optionals cudaFlags.isJetsonBuild [ "aarch64-linux" ]; 103 106 maintainers = 104 107 with maintainers; 105 108 [
+7 -8
pkgs/development/cuda-modules/tensorrt/fixup.nix
··· 11 11 }: 12 12 let 13 13 inherit (lib) 14 + attrsets 14 15 maintainers 15 16 meta 16 17 strings 17 18 versions 18 19 ; 19 - targetArch = 20 - if hostPlatform.isx86_64 then 21 - "x86_64-linux-gnu" 22 - else if hostPlatform.isAarch64 then 23 - "aarch64-linux-gnu" 24 - else 25 - "unsupported"; 20 + # targetArch :: String 21 + targetArch = attrsets.attrByPath [ hostPlatform.system ] "unsupported" { 22 + x86_64-linux = "x86_64-linux-gnu"; 23 + aarch64-linux = "aarch64-linux-gnu"; 24 + }; 26 25 in 27 26 finalAttrs: prevAttrs: { 28 27 # Useful for inspecting why something went wrong. ··· 69 68 70 69 preInstall = 71 70 (prevAttrs.preInstall or "") 72 - + '' 71 + + strings.optionalString (targetArch != "unsupported") '' 73 72 # Replace symlinks to bin and lib with the actual directories from targets. 74 73 for dir in bin lib; do 75 74 rm "$dir"
+16 -8
pkgs/development/cuda-modules/tensorrt/shims.nix
··· 1 1 # Shims to mimic the shape of ../modules/generic/manifests/{feature,redistrib}/release.nix 2 - {package, redistArch}: 3 2 { 4 - featureRelease.${redistArch}.outputs = { 5 - bin = true; 6 - lib = true; 7 - static = true; 8 - dev = true; 9 - sample = true; 10 - python = true; 3 + lib, 4 + package, 5 + # redistArch :: String 6 + # String is `"unsupported"` if the given architecture is unsupported. 7 + redistArch, 8 + }: 9 + { 10 + featureRelease = lib.optionalAttrs (redistArch != "unsupported") { 11 + ${redistArch}.outputs = { 12 + bin = true; 13 + lib = true; 14 + static = true; 15 + dev = true; 16 + sample = true; 17 + python = true; 18 + }; 11 19 }; 12 20 redistribRelease = { 13 21 name = "TensorRT: a high-performance deep learning interface";
+1 -1
pkgs/development/libraries/science/math/magma/generic.nix
··· 159 159 description = "Matrix Algebra on GPU and Multicore Architectures"; 160 160 license = licenses.bsd3; 161 161 homepage = "http://icl.cs.utk.edu/magma/index.html"; 162 - platforms = platforms.unix; 162 + platforms = platforms.linux; 163 163 maintainers = with maintainers; [ connorbaker ]; 164 164 165 165 # Cf. https://bitbucket.org/icl/magma/src/fcfe5aa61c1a4c664b36a73ebabbdbab82765e9f/CMakeLists.txt#lines-20
+1 -1
pkgs/development/libraries/xgboost/default.nix
··· 14 14 , rPackages 15 15 }@inputs: 16 16 17 - assert ncclSupport -> cudaSupport; 17 + assert ncclSupport -> (cudaSupport && !cudaPackages.nccl.meta.unsupported); 18 18 # Disable regular tests when building the R package 19 19 # because 1) the R package runs its own tests and 20 20 # 2) the R package creates a different binary shared
+2 -1
pkgs/development/python-modules/jaxlib/default.nix
··· 64 64 # aarch64-darwin is broken because of https://github.com/bazelbuild/rules_cc/pull/136 65 65 # however even with that fix applied, it doesn't work for everyone: 66 66 # https://github.com/NixOS/nixpkgs/pull/184395#issuecomment-1207287129 67 - broken = stdenv.isDarwin; 67 + # NOTE: We always build with NCCL; if it is unsupported, then our build is broken. 68 + broken = stdenv.isDarwin || nccl.meta.unsupported; 68 69 }; 69 70 70 71 cudatoolkit_joined = symlinkJoin {
+8 -5
pkgs/development/python-modules/torch/default.nix
··· 7 7 magma, 8 8 magma-hip, 9 9 magma-cuda-static, 10 - useSystemNccl ? true, 10 + # Use the system NCCL as long as we're targeting CUDA on a supported platform. 11 + useSystemNccl ? (cudaSupport && !cudaPackages.nccl.meta.unsupported), 11 12 MPISupport ? false, mpi, 12 13 buildDocs ? false, 13 14 ··· 273 274 PYTORCH_BUILD_VERSION = version; 274 275 PYTORCH_BUILD_NUMBER = 0; 275 276 276 - USE_NCCL = setBool (cudaSupport && cudaPackages ? nccl); 277 - USE_SYSTEM_NCCL = setBool useSystemNccl; # don't build pytorch's third_party NCCL 278 - USE_STATIC_NCCL = setBool useSystemNccl; 277 + # In-tree builds of NCCL are not supported. 278 + # Use NCCL when cudaSupport is enabled and nccl is available. 279 + USE_NCCL = setBool useSystemNccl; 280 + USE_SYSTEM_NCCL = USE_NCCL; 281 + USE_STATIC_NCCL = USE_NCCL; 279 282 280 283 # Suppress a weird warning in mkl-dnn, part of ideep in pytorch 281 284 # (upstream seems to have fixed this in the wrong place?) ··· 363 366 ] ++ lists.optionals (cudaPackages ? cudnn) [ 364 367 cudnn.dev 365 368 cudnn.lib 366 - ] ++ lists.optionals (useSystemNccl && cudaPackages ? nccl) [ 369 + ] ++ lists.optionals useSystemNccl [ 367 370 # Some platforms do not support NCCL (i.e., Jetson) 368 371 nccl.dev # Provides nccl.h AND a static copy of NCCL! 369 372 ] ++ lists.optionals (strings.versionOlder cudaVersion "11.8") [
+1 -8
pkgs/top-level/cuda-packages.nix
··· 72 72 73 73 # Loose packages 74 74 cudatoolkit = final.callPackage ../development/cuda-modules/cudatoolkit {}; 75 - # SaxPy is only available after 11.4 because it requires redistributable versions of CUDA libraries. 76 - saxpy = attrsets.optionalAttrs (strings.versionAtLeast cudaVersion "11.4") ( 77 - final.callPackage ../development/cuda-modules/saxpy {} 78 - ); 79 - } 80 - # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication. 81 - # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9 82 - // attrsets.optionalAttrs (!flags.isJetsonBuild) { 75 + saxpy = final.callPackage ../development/cuda-modules/saxpy {}; 83 76 nccl = final.callPackage ../development/cuda-modules/nccl {}; 84 77 nccl-tests = final.callPackage ../development/cuda-modules/nccl-tests {}; 85 78 }