pkgs/development/python-modules/pytorch/default.nix at 20.09 · tjh.dev/nixpkgs

tjh.dev / nixpkgs
Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
nixpkgs / pkgs / development / python-modules / pytorch / default.nix
at 20.09 11 kB view raw
  1{ stdenv, lib, fetchFromGitHub, fetchpatch, buildPythonPackage, python,
  2  cudaSupport ? false, cudatoolkit ? null, cudnn ? null, nccl ? null, magma ? null,
  3  mklDnnSupport ? true, useSystemNccl ? true,
  4  openMPISupport ? false, openmpi ? null,
  5  buildDocs ? false,
  6  cudaArchList ? null,
  7  numpy, pyyaml, cffi, click, typing, cmake, hypothesis, numactl, psutil,
  8  linkFarm, symlinkJoin,
  9
 10  # virtual pkg that consistently instantiates blas across nixpkgs
 11  # See https://github.com/NixOS/nixpkgs/pull/83888
 12  blas,
 13
 14  # ninja (https://ninja-build.org) must be available to run C++ extensions tests,
 15  ninja,
 16
 17  # dependencies for torch.utils.tensorboard
 18  pillow, six, future, tensorflow-tensorboard, protobuf,
 19
 20  utillinux, which, isPy3k }:
 21
 22assert !openMPISupport || openmpi != null;
 23
 24# assert that everything needed for cuda is present and that the correct cuda versions are used
 25assert !cudaSupport || cudatoolkit != null;
 26assert cudnn == null || cudatoolkit != null;
 27assert !cudaSupport || (let majorIs = lib.versions.major cudatoolkit.version;
 28                        in majorIs == "9" || majorIs == "10" || majorIs == "11");
 29
 30# confirm that cudatoolkits are sync'd across dependencies
 31assert !(openMPISupport && cudaSupport) || openmpi.cudatoolkit == cudatoolkit;
 32assert !cudaSupport || magma.cudatoolkit == cudatoolkit;
 33
 34let
 35  cudatoolkit_joined = symlinkJoin {
 36    name = "${cudatoolkit.name}-unsplit";
 37    # nccl is here purely for semantic grouping it could be moved to nativeBuildInputs
 38    paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ];
 39  };
 40
 41  # Give an explicit list of supported architectures for the build, See:
 42  # - pytorch bug report: https://github.com/pytorch/pytorch/issues/23573
 43  # - pytorch-1.2.0 build on nixpks: https://github.com/NixOS/nixpkgs/pull/65041
 44  #
 45  # This list was selected by omitting the TORCH_CUDA_ARCH_LIST parameter,
 46  # observing the fallback option (which selected all architectures known
 47  # from cudatoolkit_10_0, pytorch-1.2, and python-3.6), and doing a binary
 48  # searching to find offending architectures.
 49  #
 50  # NOTE: Because of sandboxing, this derivation can't auto-detect the hardware's
 51  # cuda architecture, so there is also now a problem around new architectures
 52  # not being supported until explicitly added to this derivation.
 53  #
 54  # FIXME: CMake is throwing the following warning on python-1.2:
 55  #
 56  # ```
 57  # CMake Warning at cmake/public/utils.cmake:172 (message):
 58  #   In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST
 59  #   to cmake instead of implicitly setting it as an env variable.  This will
 60  #   become a FATAL_ERROR in future version of pytorch.
 61  # ```
 62  # If this is causing problems for your build, this derivation may have to strip
 63  # away the standard `buildPythonPackage` and use the
 64  # [*Adjust Build Options*](https://github.com/pytorch/pytorch/tree/v1.2.0#adjust-build-options-optional)
 65  # instructions. This will also add more flexibility around configurations
 66  # (allowing FBGEMM to be built in pytorch-1.1), and may future proof this
 67  # derivation.
 68  brokenArchs = [ "3.0" ]; # this variable is only used as documentation.
 69  cuda9ArchList = [
 70    "3.5"
 71    "5.0"
 72    "5.2"
 73    "6.0"
 74    "6.1"
 75    "7.0"
 76    "7.0+PTX"  # I am getting a "undefined architecture compute_75" on cuda 9
 77               # which leads me to believe this is the final cuda-9-compatible architecture.
 78  ];
 79  cuda10ArchList = cuda9ArchList ++ [
 80    "7.5"
 81    "7.5+PTX"  # < most recent architecture as of cudatoolkit_10_0 and pytorch-1.2.0
 82  ];
 83  final_cudaArchList =
 84    if !cudaSupport || cudaArchList != null
 85    then cudaArchList
 86    else
 87      if lib.versions.major cudatoolkit.version == "9"
 88      then cuda9ArchList
 89      else cuda10ArchList; # the assert above removes any ambiguity here.
 90
 91  # Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
 92  # LD_LIBRARY_PATH=/run/opengl-driver/lib.  We only use the stub
 93  # libcuda.so from cudatoolkit for running tests, so that we don’t have
 94  # to recompile pytorch on every update to nvidia-x11 or the kernel.
 95  cudaStub = linkFarm "cuda-stub" [{
 96    name = "libcuda.so.1";
 97    path = "${cudatoolkit}/lib/stubs/libcuda.so";
 98  }];
 99  cudaStubEnv = lib.optionalString cudaSupport
100    "LD_LIBRARY_PATH=${cudaStub}\${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ";
101
in buildPythonPackage rec {
  pname = "pytorch";
  # When bumping this, update pytorch-bin to the same version.
  version = "1.6.0";

  # Disabled unless isPy3k is set.
  disabled = !isPy3k;

  outputs = [
    "out" # standard python package
    "dev" # libtorch headers
    "lib" # libtorch libraries
  ];

  # Tagged upstream release, fetched together with its git submodules.
  src = fetchFromGitHub {
    owner = "pytorch";
    repo = "pytorch";
    rev = "v${version}";
    sha256 = "14hhjsi6fnpaw9m1a3bhvdinsks6fhss6bbcrfk6jgns64abqdaz";
    fetchSubmodules = true;
  };
122
123  patches = lib.optionals stdenv.isAarch64 [
124    # GNU aarch64 assembler does not support 4s on neon mov:
125    # https://github.com/pytorch/pytorch/issues/33124
126    #
127    # Fix from:
128    # https://github.com/pytorch/pytorch/pull/40584
129    #
130    # This patch can be removed with the next major version (1.7.0).
131    (fetchpatch {
132      name = "qnnpack-neon-fix.patch";
133      url = "https://github.com/pytorch/pytorch/commit/7676682584d0caf9243bce74ea0a88711ec4a807.diff";
134      sha256 = "13spncaqlpsp8qk2850yly7xqwmhhfwznhmzkk8jgpslkbx75vgq";
135    })
136  ] ++ lib.optionals stdenv.isDarwin [
137    # pthreadpool added support for Grand Central Dispatch in April
138    # 2020. However, this relies on functionality (DISPATCH_APPLY_AUTO)
139    # that is available starting with macOS 10.13. However, our current
140    # base is 10.12. Until we upgrade, we can fall back on the older
141    # pthread support.
142    ./pthreadpool-disable-gcd.diff
143  ];
144
145  preConfigure = lib.optionalString cudaSupport ''
146    export TORCH_CUDA_ARCH_LIST="${lib.strings.concatStringsSep ";" final_cudaArchList}"
147    export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
148  '' + lib.optionalString (cudaSupport && cudnn != null) ''
149    export CUDNN_INCLUDE_DIR=${cudnn}/include
150  '';
151
152  # Use pytorch's custom configurations
153  dontUseCmakeConfigure = true;
154
155  BUILD_NAMEDTENSOR = true;
156  BUILD_DOCS = buildDocs;
157
158  USE_MKL = blas.implementation == "mkl";
159
160  # Unlike MKL, oneDNN (née MKLDNN) is FOSS, so we enable support for
161  # it by default. PyTorch currently uses its own vendored version
162  # of oneDNN through Intel iDeep.
163  USE_MKLDNN = mklDnnSupport;
164  USE_MKLDNN_CBLAS = mklDnnSupport;
165
166  preBuild = ''
167    export MAX_JOBS=$NIX_BUILD_CORES
168    ${python.interpreter} setup.py build --cmake-only
169    ${cmake}/bin/cmake build
170  '';
171
172  preFixup = ''
173    function join_by { local IFS="$1"; shift; echo "$*"; }
174    function strip2 {
175      IFS=':'
176      read -ra RP <<< $(patchelf --print-rpath $1)
177      IFS=' '
178      RP_NEW=$(join_by : ''${RP[@]:2})
179      patchelf --set-rpath \$ORIGIN:''${RP_NEW} "$1"
180    }
181    for f in $(find ''${out} -name 'libcaffe2*.so')
182    do
183      strip2 $f
184    done
185  '';
186
187  # Override the (weirdly) wrong version set by default. See
188  # https://github.com/NixOS/nixpkgs/pull/52437#issuecomment-449718038
189  # https://github.com/pytorch/pytorch/blob/v1.0.0/setup.py#L267
190  PYTORCH_BUILD_VERSION = version;
191  PYTORCH_BUILD_NUMBER = 0;
192
193  USE_SYSTEM_NCCL=useSystemNccl;                  # don't build pytorch's third_party NCCL
194
195  # Suppress a weird warning in mkl-dnn, part of ideep in pytorch
196  # (upstream seems to have fixed this in the wrong place?)
197  # https://github.com/intel/mkl-dnn/commit/8134d346cdb7fe1695a2aa55771071d455fae0bc
198  # https://github.com/pytorch/pytorch/issues/22346
199  #
200  # Also of interest: pytorch ignores CXXFLAGS uses CFLAGS for both C and C++:
201  # https://github.com/pytorch/pytorch/blob/v1.2.0/setup.py#L17
202  NIX_CFLAGS_COMPILE = lib.optionals (blas.implementation == "mkl") [ "-Wno-error=array-bounds" ];
203
204  nativeBuildInputs = [
205    cmake
206    utillinux
207    which
208    ninja
209  ] ++ lib.optionals cudaSupport [ cudatoolkit_joined ];
210
211  buildInputs = [ blas blas.provider ]
212    ++ lib.optionals cudaSupport [ cudnn magma nccl ]
213    ++ lib.optionals stdenv.isLinux [ numactl ];
214
215  propagatedBuildInputs = [
216    cffi
217    click
218    numpy
219    pyyaml
220    # the following are required for tensorboard support
221    pillow six future tensorflow-tensorboard protobuf
222  ] ++ lib.optionals openMPISupport [ openmpi ];
223
224  checkInputs = [ hypothesis ninja psutil ];
225
226  # Tests take a long time and may be flaky, so just sanity-check imports
227  doCheck = false;
228  pythonImportsCheck = [
229    "torch"
230  ];
231
232  checkPhase = with lib.versions; with lib.strings; concatStringsSep " " [
233    cudaStubEnv
234    "${python.interpreter} test/run_test.py"
235    "--exclude"
236    (concatStringsSep " " [
237      "utils" # utils requires git, which is not allowed in the check phase
238
239      # "dataloader" # psutils correctly finds and triggers multiprocessing, but is too sandboxed to run -- resulting in numerous errors
240      # ^^^^^^^^^^^^ NOTE: while test_dataloader does return errors, these are acceptable errors and do not interfere with the build
241
242      # tensorboard has acceptable failures for pytorch 1.3.x due to dependencies on tensorboard-plugins
243      (optionalString (majorMinor version == "1.3" ) "tensorboard")
244    ])
245  ];
246  postInstall = ''
247    mkdir $dev
248    cp -r $out/${python.sitePackages}/torch/include $dev/include
249    cp -r $out/${python.sitePackages}/torch/share   $dev/share
250
251    mkdir $lib
252    cp -r $out/${python.sitePackages}/torch/lib     $lib/lib
253  '';
254
255  postFixup = stdenv.lib.optionalString stdenv.isDarwin ''
256    for f in $(ls $lib/lib/*.dylib); do
257        install_name_tool -id $lib/lib/$(basename $f) $f || true
258    done
259
260    install_name_tool -change @rpath/libshm.dylib $lib/lib/libshm.dylib $lib/lib/libtorch_python.dylib
261    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libtorch_python.dylib
262    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch_python.dylib
263
264    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libtorch.dylib
265
266    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libcaffe2_observers.dylib
267    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libcaffe2_observers.dylib
268
269    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libcaffe2_module_test_dynamic.dylib
270    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libcaffe2_module_test_dynamic.dylib
271
272    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libcaffe2_detectron_ops.dylib
273    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libcaffe2_detectron_ops.dylib
274
275    install_name_tool -change @rpath/libtorch.dylib $lib/lib/libtorch.dylib $lib/lib/libshm.dylib
276    install_name_tool -change @rpath/libc10.dylib $lib/lib/libc10.dylib $lib/lib/libshm.dylib
277  '';
278
279
280  meta = {
281    description = "Open source, prototype-to-production deep learning platform";
282    homepage    = "https://pytorch.org/";
283    license     = lib.licenses.bsd3;
284    platforms   = with lib.platforms; linux ++ lib.optionals (!cudaSupport) darwin;
285    maintainers = with lib.maintainers; [ teh thoughtpolice tscholak ]; # tscholak esp. for darwin-related builds
286  };
287}