nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
at flake-libs 438 lines 11 kB view raw
{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonRelaxDepsHook,
  fetchFromGitHub,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,
  wheel,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

  # Returns pkg if it is non-null and available on the host platform, else null.
  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v3.8.0";
    hash = "sha256-oIzlbKRdOh6gp6nRZ8udLSqleBFoFtgM7liCBlHZLOk=";
  };

  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "575f7724b9762f265bbee5889df9c7d630801845";
      hash = "sha256-8WrKMl0olr0nYV4FRJfwSaJ0F5gWQpssoFMjr9tbHBk=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  vllm-flash-attn = stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.2.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "dc9d410b3e2d6534a4c70724c2515f4def670a22";
      hash = "sha256-ZQ0bOBIb+8IMmya8dmimKQ17KTBplX81IirdnBJpX5M=";
    };

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.6.0/torch/utils/cpp_extension.py#L2046-L2048
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        "10.0"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Abort the evaluation (via trivial.throwIf) when no supported GPU targets remain,
  # listing the requested-but-unsupported targets in the error; otherwise pass the
  # supported list through unchanged.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.8.3";
  pyproject = true;

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-LiEBkVwJTT4WoCTk9pI0ykTjmv1pDMzksmFwVktoxMY=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
  ];

  postPatch =
    ''
      # pythonRelaxDeps does not cover build-system
      substituteInPlace pyproject.toml \
        --replace-fail "torch ==" "torch >="

      # Ignore the python version check because it hard-codes minor versions and
      # lags behind `ray`'s python interpreter support
      substituteInPlace CMakeLists.txt \
        --replace-fail \
          'set(PYTHON_SUPPORTED_VERSIONS' \
          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
    ''
    + lib.optionalString (nccl == null) ''
      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
      substituteInPlace vllm/distributed/parallel_state.py \
        --replace-fail '"nccl"' '"gloo"'
    '';

  nativeBuildInputs =
    [
      which
    ]
    ++ lib.optionals rocmSupport [
      rocmPackages.hipcc
    ]
    ++ lib.optionals cudaSupport [
      cudaPackages.cuda_nvcc
      autoAddDriverRunpath
    ]
    ++ lib.optionals isCudaJetson [
      cudaPackages.autoAddCudaCompatRunpath
    ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies =
    [
      aioprometheus
      blake3
      cachetools
      depyf
      fastapi
      llguidance
      lm-format-enforcer
      numpy
      openai
      opencv-python-headless
      outlines
      pandas
      prometheus-fastapi-instrumentator
      psutil
      py-cpuinfo
      pyarrow
      pydantic
      python-json-logger
      python-multipart
      pyzmq
      ray
      sentencepiece
      tiktoken
      tokenizers
      msgspec
      gguf
      einops
      importlib-metadata
      partial-json-parser
      compressed-tensors
      mistral-common
      torch
      torchaudio
      torchvision
      transformers
      uvicorn
      xformers
      xgrammar
      numba
    ]
    ++ uvicorn.optional-dependencies.standard
    ++ aioprometheus.optional-dependencies.starlette
    ++ lib.optionals cudaSupport [
      cupy
      pynvml
    ];

  dontUseCmakeConfigure = true;
  cmakeFlags =
    [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
      (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
    ]
    ++ lib.optionals cudaSupport [
      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
        name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
      }}")
      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
    ]
    ++ lib.optionals cpuSupport [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
    ];

  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  # updates the cutlass fetcher instead
  passthru.skipBulkUpdate = true;

  meta = with lib; {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = licenses.asl20;
    maintainers = with maintainers; [
      happysalada
      lach
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}