pkgs/development/python-modules/pytorch/default.nix at 22.05

tjh.dev / nixpkgs
fork atom
nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
fork atom
nixpkgs / pkgs / development / python-modules / pytorch / default.nix
at 22.05 326 lines 12 kB view raw
wrap content
  1{ stdenv, lib, fetchFromGitHub, fetchpatch, buildPythonPackage, python,
     # Feature toggles and their companions:
     #   cudaSupport   - switches on the CUDA-specific configuration and inputs
     #   mklDnnSupport - exported as USE_MKLDNN / USE_MKLDNN_CBLAS
     #   useSystemNccl - exported as USE_SYSTEM_NCCL
     #   MPISupport    - adds mpi to propagatedBuildInputs
     #   buildDocs     - exported as BUILD_DOCS
     #   cudaArchList  - explicit list for TORCH_CUDA_ARCH_LIST; when null and
     #                   cudaSupport is on, a default is picked from the
     #                   cudatoolkit major version
     # NOTE(review): fetchpatch and isPy3k are not referenced anywhere in the
     # expression as shown here -- confirm before relying on them.
  2  cudaSupport ? false, cudaPackages, magma,
  3  mklDnnSupport ? true, useSystemNccl ? true,
  4  MPISupport ? false, mpi,
  5  buildDocs ? false,
  6  cudaArchList ? null,
  7
  8  # Native build inputs
  9  cmake, util-linux, linkFarm, symlinkJoin, which, pybind11, removeReferencesTo,
 10
 11  # Build inputs
 12  numactl,
 13
 14  # Propagated build inputs
 15  numpy, pyyaml, cffi, click, typing-extensions,
 16
 17  # Unit tests
 18  hypothesis, psutil,
 19
 20  # Virtual package that consistently instantiates blas across nixpkgs.
 21  # See https://github.com/NixOS/nixpkgs/pull/83888
 22  blas,
 23
 24  # ninja (https://ninja-build.org) must be available to run the C++ extension tests.
 25  ninja,
 26
 27  # Dependencies for torch.utils.tensorboard
 28  pillow, six, future, tensorboard, protobuf,
 29
 30  isPy3k, pythonOlder }:
 31
 32let
 33  inherit (cudaPackages) cudatoolkit cudnn nccl;
 34in
 35
 36# CUDA builds are only accepted for cudatoolkit major versions 9, 10 and 11
 37assert cudaSupport ->
 38  builtins.elem (lib.versions.major cudatoolkit.version) [ "9" "10" "11" ];
 39
 40# every CUDA-aware dependency has to be built against that very same cudatoolkit
 41assert (MPISupport && cudaSupport) -> mpi.cudatoolkit == cudatoolkit;
 42assert cudaSupport -> magma.cudatoolkit == cudatoolkit;
 43
 44let
 45  setBool = enabled: builtins.toString (if enabled then 1 else 0);  # true -> "1", false -> "0"
 46  cudatoolkit_joined = symlinkJoin {
 47    # nccl is listed here purely for semantic grouping; it could be moved to nativeBuildInputs
 48    paths = [ cudatoolkit.out cudatoolkit.lib ] ++ [ nccl.dev nccl.out ];
 49    name = cudatoolkit.name + "-unsplit";
 50  };
 51
 52  # Give an explicit list of supported architectures for the build. See:
 53  # - pytorch bug report: https://github.com/pytorch/pytorch/issues/23573
 54  # - pytorch-1.2.0 build on nixpkgs: https://github.com/NixOS/nixpkgs/pull/65041
 55  #
 56  # This list was selected by omitting the TORCH_CUDA_ARCH_LIST parameter,
 57  # observing the fallback option (which selected all architectures known
 58  # from cudatoolkit_10_0, pytorch-1.2, and python-3.6), and then binary
 59  # searching for the offending architectures.
 60  #
 61  # NOTE: Because of sandboxing, this derivation can't auto-detect the hardware's
 62  # cuda architecture, so there is also now a problem around new architectures
 63  # not being supported until explicitly added to this derivation.
 64  #
 65  # FIXME: CMake is throwing the following warning on pytorch-1.2:
 66  #
 67  # ```
 68  # CMake Warning at cmake/public/utils.cmake:172 (message):
 69  #   In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST
 70  #   to cmake instead of implicitly setting it as an env variable.  This will
 71  #   become a FATAL_ERROR in future version of pytorch.
 72  # ```
 73  # If this is causing problems for your build, this derivation may have to strip
 74  # away the standard `buildPythonPackage` and use the
 75  # [*Adjust Build Options*](https://github.com/pytorch/pytorch/tree/v1.2.0#adjust-build-options-optional)
 76  # instructions. This will also add more flexibility around configurations
 77  # (allowing FBGEMM to be built in pytorch-1.1), and may future-proof this
 78  # derivation.
 79  brokenArchs = [ "3.0" ]; # this variable is only used as documentation.
 80
 81  # CUDA compute capabilities to build for, keyed by cudatoolkit major version.
 82  # Each newer toolkit extends the list of the previous one.
 83  cudaCapabilities =
 84    let
 85      # a capability followed by its "+PTX" variant
 86      withPtx = arch: [ arch "${arch}+PTX" ];
 87
 88      # I am getting a "undefined architecture compute_75" on cuda 9
 89      # which leads me to believe 7.0 is the final cuda-9-compatible architecture.
 90      upToCuda9 = [ "3.5" "5.0" "5.2" "6.0" "6.1" ] ++ withPtx "7.0";
 91
 92      # 7.5: most recent architecture as of cudatoolkit_10_0 and pytorch-1.2.0
 93      upToCuda10 = upToCuda9 ++ withPtx "7.5";
 94
 95      upToCuda11 = upToCuda10
 96        ++ withPtx "8.0"  # < CUDA toolkit 11.0
 97        ++ withPtx "8.6"; # < CUDA toolkit 11.1
 98
 99    in
100    {
101      cuda9 = upToCuda9;
102      cuda10 = upToCuda10;
103      cuda11 = upToCuda11;
104    };
105  final_cudaArchList =  # caller-supplied list wins; otherwise default per toolkit major
106    if cudaSupport && cudaArchList == null
107    then cudaCapabilities."cuda${lib.versions.major cudatoolkit.version}"
108    else cudaArchList;
109
110  # At runtime libcuda.so.1 normally comes from nvidia-x11 through
111  # LD_LIBRARY_PATH=/run/opengl-driver/lib.  For running tests we link the
112  # stub libcuda.so shipped with cudatoolkit instead, so pytorch does not
113  # need a rebuild on every nvidia-x11 or kernel update.
114  cudaStub = linkFarm "cuda-stub" [
115    { name = "libcuda.so.1"; path = "${cudatoolkit}/lib/stubs/libcuda.so"; }
116  ];
117  # Shell prefix putting the stub on LD_LIBRARY_PATH; empty without CUDA.
118  cudaStubEnv = if !cudaSupport then "" else
119    "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";
120
121in buildPythonPackage rec {
122  pname = "pytorch";
123  # Don't forget to update pytorch-bin to the same version.
124  version = "1.11.0";
125  format = "setuptools";
126
127  disabled = pythonOlder "3.7.0"; # disabled on Python older than 3.7.0
128
129  outputs = [
130    "out"   # output standard python package
131    "dev"   # output libtorch headers
132    "lib"   # output libtorch libraries
133  ];
134
135  src = fetchFromGitHub {
136    owner  = "pytorch";
137    repo   = "pytorch";
138    rev    = "v${version}"; # revision is derived from `version` above
139    fetchSubmodules = true; # the checkout includes the repository's git submodules
140    sha256 = "sha256-CEu63tdRBAF8CTchO3Qu8gUNObQylX6U08yDTI4/c/0=";
141  };
142
143  patches =
144    [
145      # Fix for a breakpad incompatibility with glibc>2.33
146      # https://github.com/pytorch/pytorch/issues/70297
147      # https://github.com/google/breakpad/commit/605c51ed96ad44b34c457bbca320e74e194c317e
148      ./breakpad-sigstksz.patch
149    ]
150    # Only applied on Darwin:
151    # pthreadpool added support for Grand Central Dispatch in April 2020,
152    # but that relies on functionality (DISPATCH_APPLY_AUTO) which is only
153    # available starting with macOS 10.13, while our current base is 10.12.
154    # Until we upgrade, we can fall back on the older pthread support.
155    ++ lib.optional stdenv.isDarwin ./pthreadpool-disable-gcd.diff;
156
157  preConfigure = lib.optionalString cudaSupport (''
158    export TORCH_CUDA_ARCH_LIST="${lib.concatStringsSep ";" final_cudaArchList}"
159    export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
160  '' + lib.optionalString (cudnn != null) ''
161    export CUDNN_INCLUDE_DIR=${cudnn}/include
162  '');
163
164  # Use pytorch's custom configurations: the CMake step is driven through
     # setup.py in preBuild instead of the generic cmake configure.
165  dontUseCmakeConfigure = true;
166
167  BUILD_NAMEDTENSOR = setBool true;   # setBool yields the strings "1" / "0"
168  BUILD_DOCS = setBool buildDocs;
169
170  # We only do an imports check, so do not build tests either.
171  BUILD_TEST = setBool false;
172
173  # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for
174  # it by default. PyTorch currently uses its own vendored version
175  # of oneDNN through Intel iDeep.
176  USE_MKLDNN = setBool mklDnnSupport;
177  USE_MKLDNN_CBLAS = setBool mklDnnSupport;
178
179  preBuild = ''
180    export MAX_JOBS=$NIX_BUILD_CORES
181    ${python.interpreter} setup.py build --cmake-only
182    ${cmake}/bin/cmake build
183  '';
     # ^ MAX_JOBS follows NIX_BUILD_CORES; setup.py is run with --cmake-only and
     #   cmake is then invoked on the resulting `build` directory.
184
185  preFixup = ''
186    function join_by { local IFS="$1"; shift; echo "$*"; }
187    function strip2 {
188      # Locals plus a per-command IFS keep the shell's global IFS untouched.
189      local RP RP_NEW
190      IFS=':' read -ra RP <<< "$(patchelf --print-rpath "$1")"
191      RP_NEW=$(join_by : "''${RP[@]:2}")
192      patchelf --set-rpath "\$ORIGIN''${RP_NEW:+:$RP_NEW}" "$1"
193    }
194    for f in $(find ''${out} -name 'libcaffe2*.so')
195    do
196      strip2 $f
197    done
198  '';
     # ^ strip2 drops the first two RPATH entries of a library and puts $ORIGIN
     #   in front of whatever remains (no trailing ":" when nothing remains);
     #   it is applied to every libcaffe2*.so that `find` locates under $out.
199
200  # Override the (weirdly) wrong version set by default. See
201  # https://github.com/NixOS/nixpkgs/pull/52437#issuecomment-449718038
202  # https://github.com/pytorch/pytorch/blob/v1.0.0/setup.py#L267
203  PYTORCH_BUILD_VERSION = version;
204  PYTORCH_BUILD_NUMBER = 0;
205
206  USE_SYSTEM_NCCL=setBool useSystemNccl;                  # when enabled, don't build pytorch's third_party NCCL
207
208  # Silence an odd warning coming from mkl-dnn (pulled in through ideep);
209  # upstream appears to have fixed it in the wrong place:
210  # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc
211  # https://github.com/pytorch/pytorch/issues/22346
212  #
213  # Also of interest: pytorch ignores CXXFLAGS and uses CFLAGS for both C and C++:
214  # https://github.com/pytorch/pytorch/blob/v1.11.0/setup.py#L17
215  NIX_CFLAGS_COMPILE = lib.optional (blas.implementation == "mkl") "-Wno-error=array-bounds";
216
217  nativeBuildInputs =
218    [ cmake util-linux which ninja pybind11 removeReferencesTo ]
219    ++ lib.optional cudaSupport cudatoolkit_joined;
220
221  buildInputs =
222    [ blas blas.provider ]
223    ++ lib.optionals cudaSupport [ cudnn magma nccl ]
224    ++ lib.optional stdenv.isLinux numactl;
225
226  propagatedBuildInputs =
227    [
228      cffi click numpy pyyaml typing-extensions
229
230      # the following are required for tensorboard support
231      pillow
232      six
233      future
234      tensorboard
235      protobuf
236    ]
237    ++ lib.optional MPISupport mpi;
238
239  # unit test dependencies
240  checkInputs = [ hypothesis ninja psutil ];
241
242  # The test suite is slow and may be flaky, so only the import check below runs
243  doCheck = false;
244  pythonImportsCheck = [ "torch" ];
245
246  checkPhase =
247    let
248      excludedTests = lib.concatStringsSep " " [
249        "utils" # utils requires git, which is not allowed in the check phase
250        # "dataloader" # psutils correctly finds and triggers multiprocessing, but is too sandboxed to run -- resulting in numerous errors
251        # ^^^^^^^^^^^^ NOTE: while test_dataloader does return errors, these are acceptable errors and do not interfere with the build
252        # tensorboard has acceptable failures for pytorch 1.3.x due to dependencies on tensorboard-plugins
253        (lib.optionalString (lib.versions.majorMinor version == "1.3") "tensorboard")
254      ];
255    in
256    lib.concatStringsSep " " [
257      cudaStubEnv
258      "${python.interpreter} test/run_test.py"
259      "--exclude"
260      excludedTests
261    ];
262  postInstall = ''
263    find "$out/${python.sitePackages}/torch/include" "$out/${python.sitePackages}/torch/lib" -type f -exec remove-references-to -t ${stdenv.cc} '{}' +
264
265    mkdir $dev
266    cp -r $out/${python.sitePackages}/torch/include $dev/include
267    cp -r $out/${python.sitePackages}/torch/share   $dev/share
268
269    # Fix up library paths for split outputs
270    substituteInPlace \
271      $dev/share/cmake/Torch/TorchConfig.cmake \
272      --replace \''${TORCH_INSTALL_PREFIX}/lib "$lib/lib"
273
274    substituteInPlace \
275      $dev/share/cmake/Caffe2/Caffe2Targets-release.cmake \
276      --replace \''${_IMPORT_PREFIX}/lib "$lib/lib"
277
278    mkdir $lib
279    mv $out/${python.sitePackages}/torch/lib     $lib/lib
280    ln -s $lib/lib $out/${python.sitePackages}/torch/lib
281  '';
     # ^ Removes references to stdenv.cc from the installed headers and libraries,
     #   copies torch/include and torch/share into $dev (rewriting the lib paths
     #   in the two CMake files to $lib/lib), then moves torch/lib to $lib/lib
     #   and leaves a symlink to it at the original location in $out.
282
283  postFixup = lib.optionalString stdenv.isDarwin ''
284    for f in "$lib"/lib/*.dylib; do
285        install_name_tool -id "$lib/lib/$(basename "$f")" "$f" || true
286    done
287
288    install_name_tool -change @rpath/libshm.dylib $lib/lib/libshm.dylib $lib/lib/libtorch_python.dylib
289    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libtorch_python.dylib
290    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch_python.dylib
291
292    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch.dylib
293
294    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libcaffe2_observers.dylib
295    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libcaffe2_observers.dylib
296
297    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libcaffe2_module_test_dynamic.dylib
298    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libcaffe2_module_test_dynamic.dylib
299
300    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libcaffe2_detectron_ops.dylib
301    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libcaffe2_detectron_ops.dylib
302
303    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libshm.dylib
304    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libshm.dylib
305  '';
     # ^ Darwin only: gives every dylib in $lib/lib an absolute install name and
     #   rewrites the listed @rpath references to absolute $lib/lib paths. The
     #   loop iterates over a quoted glob rather than parsing `ls` output.
306
307  # Builds in 2+h with 2 cores, and ~15m with a big-parallel builder.
308  requiredSystemFeatures = [ "big-parallel" ];
309
310  passthru = {
311    inherit cudaSupport cudaPackages;
312    cudaArchList = final_cudaArchList; # resolved list; stays null when no list was given and cudaSupport is off
313    # At least for 1.10.2, `torch.fft` is unavailable unless the BLAS provider is MKL. This attribute allows for easy detection of its availability.
314    blasProvider = blas.provider;
315  };
316
317  meta = {
318    description = "Open source, prototype-to-production deep learning platform";
319    homepage = "https://pytorch.org/";
320    license = lib.licenses.bsd3;
321    maintainers = with lib.maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
322    platforms = lib.platforms.linux ++ lib.optionals (!cudaSupport) lib.platforms.darwin;
323    # darwin: error: use of undeclared identifier 'noU'; did you mean 'no'?
324    broken = stdenv.isDarwin || (stdenv.isLinux && stdenv.isAarch64);
325  };
326}