{ stdenv, lib, fetchFromGitHub, fetchpatch, buildPythonPackage, python,
  cudaSupport ? false, cudaPackages, magma,
  mklDnnSupport ? true, useSystemNccl ? true,
  MPISupport ? false, mpi,
  buildDocs ? false,
  cudaArchList ? null,

  # Native build inputs
  cmake, util-linux, linkFarm, symlinkJoin, which, pybind11, removeReferencesTo,

  # Build inputs
  numactl,
  CoreServices, libobjc,

  # Propagated build inputs
  numpy, pyyaml, cffi, click, typing-extensions,

  # Unit tests
  hypothesis, psutil,

  # virtual pkg that consistently instantiates blas across nixpkgs
  # See https://github.com/NixOS/nixpkgs/pull/83888
  blas,

  # ninja (https://ninja-build.org) must be available to run C++ extension tests
  ninja,

  linuxHeaders_5_19,

  # dependencies for torch.utils.tensorboard
  pillow, six, future, tensorboard, protobuf,

  isPy3k, pythonOlder }:

let
  inherit (cudaPackages) cudatoolkit cudnn nccl;
in

# assert that everything needed for cuda is present and that the correct cuda versions are used
assert !cudaSupport || (let majorIs = lib.versions.major cudatoolkit.version;
                        in majorIs == "9" || majorIs == "10" || majorIs == "11");

# confirm that cudatoolkits are sync'd across dependencies
assert !(MPISupport && cudaSupport) || mpi.cudatoolkit == cudatoolkit;
assert !cudaSupport || magma.cudatoolkit == cudatoolkit;

let
  setBool = v: if v then "1" else "0";
  cudatoolkit_joined = symlinkJoin {
    name = "${cudatoolkit.name}-unsplit";
    # nccl is here purely for semantic grouping; it could be moved to nativeBuildInputs
    paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ];
  };

  # Give an explicit list of supported architectures for the build. See:
  # - pytorch bug report: https://github.com/pytorch/pytorch/issues/23573
  # - pytorch-1.2.0 build on nixpkgs: https://github.com/NixOS/nixpkgs/pull/65041
  #
  # This list was selected by omitting the TORCH_CUDA_ARCH_LIST parameter,
  # observing the fallback option (which selected all architectures known
  # from cudatoolkit_10_0, pytorch-1.2, and python-3.6), and doing a binary
  # search to find the offending architectures.
  #
  # NOTE: Because of sandboxing, this derivation can't auto-detect the hardware's
  # cuda architecture, so there is also now a problem around new architectures
  # not being supported until explicitly added to this derivation.
  #
  # FIXME: CMake is throwing the following warning on pytorch-1.2:
  #
  # ```
  # CMake Warning at cmake/public/utils.cmake:172 (message):
  #   In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST
  #   to cmake instead of implicitly setting it as an env variable. This will
  #   become a FATAL_ERROR in future version of pytorch.
  # ```
  # If this is causing problems for your build, this derivation may have to strip
  # away the standard `buildPythonPackage` and use the
  # [*Adjust Build Options*](https://github.com/pytorch/pytorch/tree/v1.2.0#adjust-build-options-optional)
  # instructions. This will also add more flexibility around configurations
  # (allowing FBGEMM to be built in pytorch-1.1), and may future-proof this
  # derivation.
  brokenArchs = [ "3.0" ]; # this variable is only used as documentation.
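  # A hedged sketch (not part of this file) of how the knobs above compose
  # for a consumer; the attribute path `python3Packages.torch` is an
  # assumption, adjust to your package set. A non-null cudaArchList bypasses
  # the capability table below and is exported verbatim (semicolon-joined)
  # as TORCH_CUDA_ARCH_LIST in preConfigure:
  #
  #   python3Packages.torch.override {
  #     cudaSupport = true;
  #     cudaArchList = [ "8.6" "8.6+PTX" ];
  #   }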

  cudaCapabilities = rec {
    cuda9 = [
      "3.5"
      "5.0"
      "5.2"
      "6.0"
      "6.1"
      "7.0"
      "7.0+PTX" # I am getting an "undefined architecture compute_75" on cuda 9,
                # which leads me to believe this is the final cuda-9-compatible architecture.
    ];

    cuda10 = cuda9 ++ [
      "7.5"
      "7.5+PTX" # < most recent architecture as of cudatoolkit_10_0 and pytorch-1.2.0
    ];

    cuda11 = cuda10 ++ [
      "8.0"
      "8.0+PTX" # < CUDA toolkit 11.0
      "8.6"
      "8.6+PTX" # < CUDA toolkit 11.1
    ];
  };
  final_cudaArchList =
    if !cudaSupport || cudaArchList != null
    then cudaArchList
    else cudaCapabilities."cuda${lib.versions.major cudatoolkit.version}";

  # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
  # LD_LIBRARY_PATH=/run/opengl-driver/lib. We only use the stub
  # libcuda.so from cudatoolkit for running tests, so that we don’t have
  # to recompile pytorch on every update to nvidia-x11 or the kernel.
  cudaStub = linkFarm "cuda-stub" [{
    name = "libcuda.so.1";
    path = "${cudatoolkit}/lib/stubs/libcuda.so";
  }];
  cudaStubEnv = lib.optionalString cudaSupport
    "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";

in buildPythonPackage rec {
  pname = "torch";
  # Don't forget to update torch-bin to the same version.
  version = "1.12.1";
  format = "setuptools";

  disabled = pythonOlder "3.7.0";

  outputs = [
    "out" # output standard python package
    "dev" # output libtorch headers
    "lib" # output libtorch libraries
  ];

  src = fetchFromGitHub {
    owner = "pytorch";
    repo = "pytorch";
    rev = "refs/tags/v${version}";
    fetchSubmodules = true;
    hash = "sha256-8378BVOBFCRYRG1+yIYFSPKmb1rFOLgR+8pNZKt9NfI=";
  };

  patches = lib.optionals (stdenv.isDarwin && stdenv.isx86_64) [
    # pthreadpool added support for Grand Central Dispatch in April
    # 2020. However, this relies on functionality (DISPATCH_APPLY_AUTO)
    # that is available starting with macOS 10.13, while our current
    # base is 10.12. Until we upgrade, we can fall back on the older
    # pthread support.
    ./pthreadpool-disable-gcd.diff
  ];

  preConfigure = lib.optionalString cudaSupport ''
    export TORCH_CUDA_ARCH_LIST="${lib.strings.concatStringsSep ";" final_cudaArchList}"
    export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
  '' + lib.optionalString (cudaSupport && cudnn != null) ''
    export CUDNN_INCLUDE_DIR=${cudnn}/include
  '';

  # Use pytorch's custom configurations
  dontUseCmakeConfigure = true;

  BUILD_NAMEDTENSOR = setBool true;
  BUILD_DOCS = setBool buildDocs;

  # We only do an imports check, so do not build tests either.
  BUILD_TEST = setBool false;
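
  # How these flags reach the build (a sketch): mkDerivation exports plain
  # attributes like the ones above as environment variables of the builder,
  # and pytorch's setup.py consults them. For example:
  #
  #   BUILD_TEST = setBool false;     # builder sees BUILD_TEST=0
  #   BUILD_DOCS = setBool buildDocs; # BUILD_DOCS=1 when buildDocs = true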

  # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for
  # it by default. PyTorch currently uses its own vendored version
  # of oneDNN through Intel iDeep.
  USE_MKLDNN = setBool mklDnnSupport;
  USE_MKLDNN_CBLAS = setBool mklDnnSupport;

  # Avoid using pybind11 from the git submodule.
  # Also avoids pytorch exporting the headers of pybind11.
  USE_SYSTEM_BIND11 = true;

  preBuild = ''
    export MAX_JOBS=$NIX_BUILD_CORES
    ${python.interpreter} setup.py build --cmake-only
    ${cmake}/bin/cmake build
  '';

  preFixup = ''
    function join_by { local IFS="$1"; shift; echo "$*"; }
    function strip2 {
      IFS=':'
      read -ra RP <<< $(patchelf --print-rpath $1)
      IFS=' '
      RP_NEW=$(join_by : ''${RP[@]:2})
      patchelf --set-rpath \$ORIGIN:''${RP_NEW} "$1"
    }
    for f in $(find ''${out} -name 'libcaffe2*.so')
    do
      strip2 $f
    done
  '';

  # Override the (weirdly) wrong version set by default. See
  # https://github.com/NixOS/nixpkgs/pull/52437#issuecomment-449718038
  # https://github.com/pytorch/pytorch/blob/v1.0.0/setup.py#L267
  PYTORCH_BUILD_VERSION = version;
  PYTORCH_BUILD_NUMBER = 0;

  USE_SYSTEM_NCCL = setBool useSystemNccl; # don't build pytorch's third_party NCCL

  # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
  # (upstream seems to have fixed this in the wrong place?)
  # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc
  # https://github.com/pytorch/pytorch/issues/22346
  #
  # Also of interest: pytorch ignores CXXFLAGS and uses CFLAGS for both C and C++:
  # https://github.com/pytorch/pytorch/blob/v1.11.0/setup.py#L17
  NIX_CFLAGS_COMPILE = lib.optionals (blas.implementation == "mkl") [ "-Wno-error=array-bounds" ];

  nativeBuildInputs = [
    cmake
    util-linux
    which
    ninja
    pybind11
    removeReferencesTo
  ] ++ lib.optionals cudaSupport [ cudatoolkit_joined ];

  buildInputs = [ blas blas.provider pybind11 ]
    ++ [ linuxHeaders_5_19 ] # TMP: avoid "flexible array member" errors for now
    ++ lib.optionals cudaSupport [ cudnn magma nccl ]
    ++ lib.optionals stdenv.isLinux [ numactl ]
    ++ lib.optionals stdenv.isDarwin [ CoreServices libobjc ];

  propagatedBuildInputs = [
    cffi
    click
    numpy
    pyyaml
    typing-extensions
    # the following are required for tensorboard support
    pillow six future tensorboard protobuf
  ] ++ lib.optionals MPISupport [ mpi ];

  # Tests take a long time and may be flaky, so just sanity-check imports
  doCheck = false;

  pythonImportsCheck = [
    "torch"
  ];

  checkInputs = [ hypothesis ninja psutil ];

  checkPhase = with lib.versions; with lib.strings; concatStringsSep " " [
    "runHook preCheck"
    cudaStubEnv
    "${python.interpreter} test/run_test.py"
    "--exclude"
    (concatStringsSep " " [
      "utils" # utils requires git, which is not allowed in the check phase

      # "dataloader" # psutil correctly finds and triggers multiprocessing,
      # but is too sandboxed to run -- resulting in numerous errors
      # ^^^^^^^^^^^^ NOTE: while test_dataloader does return errors, these
      # are acceptable errors and do not interfere with the build

      # tensorboard has acceptable failures for pytorch 1.3.x due to
      # dependencies on tensorboard-plugins
      (optionalString (majorMinor version == "1.3") "tensorboard")
    ])
    "runHook postCheck"
  ];
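
  # For reference: since checkPhase is one concatStringsSep " " expression,
  # it evaluates to roughly this single shell line (non-CUDA case shown;
  # with cudaSupport, cudaStubEnv prefixes LD_LIBRARY_PATH with the stub
  # libcuda.so.1 from the linkFarm above):
  #
  #   runHook preCheck  <python> test/run_test.py --exclude utils  runHook postCheck
  #
  # Because doCheck = false, it only runs when a consumer re-enables tests,
  # e.g. (hypothetical) torch.overridePythonAttrs (_: { doCheck = true; }).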

  postInstall = ''
    find "$out/${python.sitePackages}/torch/include" "$out/${python.sitePackages}/torch/lib" -type f -exec remove-references-to -t ${stdenv.cc} '{}' +

    mkdir $dev
    cp -r $out/${python.sitePackages}/torch/include $dev/include
    cp -r $out/${python.sitePackages}/torch/share $dev/share

    # Fix up library paths for split outputs
    substituteInPlace \
      $dev/share/cmake/Torch/TorchConfig.cmake \
      --replace \''${TORCH_INSTALL_PREFIX}/lib "$lib/lib"

    substituteInPlace \
      $dev/share/cmake/Caffe2/Caffe2Targets-release.cmake \
      --replace \''${_IMPORT_PREFIX}/lib "$lib/lib"

    mkdir $lib
    mv $out/${python.sitePackages}/torch/lib $lib/lib
    ln -s $lib/lib $out/${python.sitePackages}/torch/lib
  '';

  postFixup = lib.optionalString stdenv.isDarwin ''
    for f in $(ls $lib/lib/*.dylib); do
      install_name_tool -id $lib/lib/$(basename $f) $f || true
    done

    install_name_tool -change @rpath/libshm.dylib $lib/lib/libshm.dylib $lib/lib/libtorch_python.dylib
    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libtorch_python.dylib
    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch_python.dylib

    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch.dylib

    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libshm.dylib
    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libshm.dylib
  '';

  # Builds in 2+h with 2 cores, and ~15m with a big-parallel builder.
  requiredSystemFeatures = [ "big-parallel" ];

  passthru = {
    inherit cudaSupport cudaPackages;
    cudaArchList = final_cudaArchList;
    # At least for 1.10.2, `torch.fft` is unavailable unless the BLAS provider
    # is MKL. This attribute allows for easy detection of its availability.
    blasProvider = blas.provider;
  };

  meta = with lib; {
    changelog = "https://github.com/pytorch/pytorch/releases/tag/v${version}";
    # keep PyTorch in the description so the package can be found under that name on search.nixos.org
    description = "PyTorch: Tensors and Dynamic neural networks in Python with strong GPU acceleration";
    homepage = "https://pytorch.org/";
    license = licenses.bsd3;
    maintainers = with maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
    platforms = with platforms; linux ++ lib.optionals (!cudaSupport) darwin;
  };
}
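
# A hedged sketch of consuming the split outputs from a downstream
# derivation (names below are assumptions, not part of this file): `dev`
# carries the libtorch headers and the CMake files patched in postInstall,
# while `lib` carries the shared libraries they now point at via $lib/lib.
#
#   stdenv.mkDerivation {
#     pname = "libtorch-consumer"; # hypothetical downstream package
#     version = "0.0.1";
#     src = ./.;
#     nativeBuildInputs = [ cmake ];
#     buildInputs = [ python3Packages.torch.dev python3Packages.torch.lib ];
#   }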