1{
2 lib,
3 stdenv,
4 python,
5 buildPythonPackage,
6 pythonRelaxDepsHook,
7 fetchFromGitHub,
8 symlinkJoin,
9 autoAddDriverRunpath,
10
11 # build system
12 packaging,
13 setuptools,
14 wheel,
15
16 # dependencies
17 which,
18 ninja,
19 cmake,
20 setuptools-scm,
21 torch,
22 outlines,
23 psutil,
24 ray,
25 pandas,
26 pyarrow,
27 sentencepiece,
28 numpy,
29 transformers,
30 xformers,
31 xgrammar,
32 numba,
33 fastapi,
34 uvicorn,
35 pydantic,
36 aioprometheus,
37 pynvml,
38 openai,
39 pyzmq,
40 tiktoken,
41 torchaudio,
42 torchvision,
43 py-cpuinfo,
44 lm-format-enforcer,
45 prometheus-fastapi-instrumentator,
46 cupy,
47 gguf,
48 einops,
49 importlib-metadata,
50 partial-json-parser,
51 compressed-tensors,
52 mistral-common,
53 msgspec,
54 numactl,
55 tokenizers,
56 oneDNN,
57 blake3,
58 depyf,
59 opencv-python-headless,
60 cachetools,
61 llguidance,
62 python-json-logger,
63 python-multipart,
64 llvmPackages,
65
66 cudaSupport ? torch.cudaSupport,
67 cudaPackages ? { },
68 rocmSupport ? torch.rocmSupport,
69 rocmPackages ? { },
70 gpuTargets ? [ ],
71}:
72
73let
74 inherit (lib)
75 lists
76 strings
77 trivial
78 ;
79
80 inherit (cudaPackages) flags;
81
82 shouldUsePkg =
83 pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;
84
  # NVIDIA CUTLASS, vendored because the build normally downloads it via
  # CMake FetchContent, which cannot happen inside the sandbox.
  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v3.8.0";
    hash = "sha256-oIzlbKRdOh6gp6nRZ8udLSqleBFoFtgM7liCBlHZLOk=";
  };
93
94 flashmla = stdenv.mkDerivation {
95 pname = "flashmla";
96 # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
97 version = "1.0.0";
98
99 # grep for GIT_TAG in the following file
100 # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
101 src = fetchFromGitHub {
102 owner = "vllm-project";
103 repo = "FlashMLA";
104 rev = "575f7724b9762f265bbee5889df9c7d630801845";
105 hash = "sha256-8WrKMl0olr0nYV4FRJfwSaJ0F5gWQpssoFMjr9tbHBk=";
106 };
107
108 dontConfigure = true;
109
110 # flashmla normally relies on `git submodule update` to fetch cutlass
111 buildPhase = ''
112 rm -rf csrc/cutlass
113 ln -sf ${cutlass} csrc/cutlass
114 '';
115
116 installPhase = ''
117 cp -rva . $out
118 '';
119 };
120
121 vllm-flash-attn = stdenv.mkDerivation {
122 pname = "vllm-flash-attn";
123 # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
124 version = "2.7.2.post1";
125
126 # grep for GIT_TAG in the following file
127 # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
128 src = fetchFromGitHub {
129 owner = "vllm-project";
130 repo = "flash-attention";
131 rev = "dc9d410b3e2d6534a4c70724c2515f4def670a22";
132 hash = "sha256-ZQ0bOBIb+8IMmya8dmimKQ17KTBplX81IirdnBJpX5M=";
133 };
134
135 dontConfigure = true;
136
137 # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
138 buildPhase = ''
139 rm -rf csrc/cutlass
140 ln -sf ${cutlass} csrc/cutlass
141 '';
142
143 installPhase = ''
144 cp -rva . $out
145 '';
146 };
147
148 cpuSupport = !cudaSupport && !rocmSupport;
149
150 # https://github.com/pytorch/pytorch/blob/v2.6.0/torch/utils/cpp_extension.py#L2046-L2048
151 supportedTorchCudaCapabilities =
152 let
153 real = [
154 "3.5"
155 "3.7"
156 "5.0"
157 "5.2"
158 "5.3"
159 "6.0"
160 "6.1"
161 "6.2"
162 "7.0"
163 "7.2"
164 "7.5"
165 "8.0"
166 "8.6"
167 "8.7"
168 "8.9"
169 "9.0"
170 "9.0a"
171 "10.0"
172 ];
173 ptx = lists.map (x: "${x}+PTX") real;
174 in
175 real ++ ptx;
176
  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a

  # For CUDA: capabilities requested via cudaPackages that Torch can build
  # for, and the remainder that it cannot.
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  # Jetson builds additionally get cudaPackages.autoAddCudaCompatRunpath
  # in nativeBuildInputs below.
  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Use trivial.throwIf to abort evaluation when no supported GPU target
  # remains, naming the unsupported requests in the error; otherwise pass
  # the supported list through unchanged.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString (semicolon-separated, e.g. "8.6;9.0").
  # Priority: explicit gpuTargets argument, then CUDA capabilities, then
  # ROCm targets; evaluating it with no GPU support at all is an error.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );
207
  # CUDA libraries later joined into one tree and handed to CMake as
  # CUDA_TOOLKIT_ROOT_DIR (see cmakeFlags below); also added to buildInputs.
  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  # (null here triggers the Gloo fallback in postPatch).
  nccl = shouldUsePkg (cudaPackages.nccl or null);
220
221 getAllOutputs = p: [
222 (lib.getBin p)
223 (lib.getLib p)
224 (lib.getDev p)
225 ];
226
227in
228
buildPythonPackage rec {
  pname = "vllm";
  version = "0.8.3";
  pyproject = true;

  # Build with torch's stdenv so compiler and C++ ABI match the torch we
  # link against.
  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    # Spell the repository name out instead of reusing `pname`: the two are
    # not required to stay in sync, and literal values keep fetchers greppable.
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-LiEBkVwJTT4WoCTk9pI0ykTjmv1pDMzksmFwVktoxMY=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
  ];
248
  # Ignore the python version check because it hard-codes minor versions and
  # lags behind `ray`'s python interpreter support
  postPatch =
    ''
      substituteInPlace CMakeLists.txt \
        --replace-fail \
          'set(PYTHON_SUPPORTED_VERSIONS' \
          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
    ''
    # When NCCL was filtered out above (nccl == null), fall back to the Gloo
    # backend that torch always provides.
    + lib.optionalString (nccl == null) ''
      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
      substituteInPlace vllm/distributed/parallel_state.py \
        --replace-fail '"nccl"' '"gloo"'
    '';
263
264 nativeBuildInputs =
265 [
266 cmake
267 ninja
268 pythonRelaxDepsHook
269 which
270 ]
271 ++ lib.optionals rocmSupport [
272 rocmPackages.hipcc
273 ]
274 ++ lib.optionals cudaSupport [
275 cudaPackages.cuda_nvcc
276 autoAddDriverRunpath
277 ]
278 ++ lib.optionals isCudaJetson [
279 cudaPackages.autoAddCudaCompatRunpath
280 ];
281
282 build-system = [
283 packaging
284 setuptools
285 wheel
286 ];
287
288 buildInputs =
289 [
290 setuptools-scm
291 torch
292 ]
293 ++ lib.optionals cpuSupport [
294 oneDNN
295 ]
296 ++ lib.optionals (cpuSupport && stdenv.isLinux) [
297 numactl
298 ]
299 ++ lib.optionals cudaSupport (
300 mergedCudaLibraries
301 ++ (with cudaPackages; [
302 nccl
303 cudnn
304 libcufile
305 ])
306 )
307 ++ lib.optionals rocmSupport (
308 with rocmPackages;
309 [
310 clr
311 rocthrust
312 rocprim
313 hipsparse
314 hipblas
315 ]
316 )
317 ++ lib.optionals stdenv.cc.isClang [
318 llvmPackages.openmp
319 ];
320
321 dependencies =
322 [
323 aioprometheus
324 blake3
325 cachetools
326 depyf
327 fastapi
328 llguidance
329 lm-format-enforcer
330 numpy
331 openai
332 opencv-python-headless
333 outlines
334 pandas
335 prometheus-fastapi-instrumentator
336 psutil
337 py-cpuinfo
338 pyarrow
339 pydantic
340 python-json-logger
341 python-multipart
342 pyzmq
343 ray
344 sentencepiece
345 tiktoken
346 tokenizers
347 msgspec
348 gguf
349 einops
350 importlib-metadata
351 partial-json-parser
352 compressed-tensors
353 mistral-common
354 torch
355 torchaudio
356 torchvision
357 transformers
358 uvicorn
359 xformers
360 xgrammar
361 numba
362 ]
363 ++ uvicorn.optional-dependencies.standard
364 ++ aioprometheus.optional-dependencies.starlette
365 ++ lib.optionals cudaSupport [
366 cupy
367 pynvml
368 ];
369
  # CMake is invoked by setup.py (see patch 0002), so skip the generic
  # configure phase and only pass flags through.
  dontUseCmakeConfigure = true;
  cmakeFlags =
    [
      # Point FetchContent / external-project lookups at the vendored
      # sources instead of the network.
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
      (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
    ]
    ++ lib.optionals cudaSupport [
      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
      # Join the split bin/lib/dev outputs of each CUDA library into one
      # tree so CMake sees a conventional toolkit root.
      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
        name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
      }}")
      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
    ]
    ++ lib.optionals cpuSupport [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
    ];
391
  # Target-device selection read by vllm's setup.py; exactly one branch
  # applies since cpuSupport is defined as neither cuda nor rocm.
  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  # Accept the dependency versions nixpkgs provides instead of upstream's
  # exact pins.
  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  # updates the cutlass fetcher instead
  passthru.skipBulkUpdate = true;
419
420 meta = with lib; {
421 description = "High-throughput and memory-efficient inference and serving engine for LLMs";
422 changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
423 homepage = "https://github.com/vllm-project/vllm";
424 license = licenses.asl20;
425 maintainers = with maintainers; [
426 happysalada
427 lach
428 ];
429 badPlatforms = [
430 # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
431 # find_isa Function invoked with incorrect arguments for function named:
432 # find_isa
433 "x86_64-darwin"
434 ];
435 };
436}