{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonAtLeast,
  fetchFromGitHub,
  fetchpatch,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,
  opentelemetry-sdk,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  bitsandbytes,
  flashinfer,
  py-libnuma,

  # internal dependency - for overriding in overlays
  vllm-flash-attn ? null,
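  # For illustration only (the attribute name on the right-hand side is
  # hypothetical): a downstream overlay can swap in its own build with
  #   vllm.override { vllm-flash-attn = myPatchedVllmFlashAttn; }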

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;
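  # e.g. shouldUsePkg (cudaPackages.nccl or null) below evaluates to nccl only
  # on platforms where it is marked available, and to null everywhere else.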

  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v3.9.2";
    hash = "sha256-teziPNA9csYvhkG5t2ht8W8x5+1YGGbHm8VKx4JoxgI=";
  };

  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "575f7724b9762f265bbee5889df9c7d630801845";
      hash = "sha256-8WrKMl0olr0nYV4FRJfwSaJ0F5gWQpssoFMjr9tbHBk=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.4.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "8798f27777fb57f447070301bf33a9f9c607f491";
      hash = "sha256-UTUvATGN1NU/Bc8qo078q6bEgILLmlrjL7Yk2iAJhg4=";
    };

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    ''
    + lib.optionalString rocmSupport ''
      rm -rf csrc/composable_kernel
      ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
    '';

    installPhase = ''
      cp -rva . $out
    '';
  }) vllm-flash-attn;

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.7.0/torch/utils/cpp_extension.py#L2343-L2345
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        "10.0"
        "10.0a"
        "10.1"
        "10.1a"
        "12.0"
        "12.0a"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a
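  # For example:
  #   lists.subtractLists [ "8.6" ] [ "8.0" "8.6" "9.0" ] == [ "8.0" "9.0" ]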

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Abort the build with a readable error if none of the requested GPU targets
  # are supported, listing the unsupported ones in the message.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;
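  # Illustrative values: gpuArchWarner [ "8.6" ] [ "3.5" ] == [ "8.6" ], while
  # gpuArchWarner [ ] [ "3.5" ] aborts evaluation with the message above.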

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );
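  # The result is a semicolon-separated target list, e.g. "7.5;8.6;8.9+PTX" for
  # a CUDA build or "gfx90a;gfx942" for ROCm (example values only).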

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];
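  # e.g. getAllOutputs cudaPackages.cuda_cudart collects the bin, lib and dev
  # outputs (each falling back to the package's default output when missing),
  # so the symlinkJoin used for CUDA_TOOLKIT_ROOT_DIR below sees binaries,
  # libraries and headers in a single tree.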

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.9.1";
  pyproject = true;

  # https://github.com/vllm-project/vllm/issues/12083
  disabled = pythonAtLeast "3.13";

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-sp7rDpewTPXTVRBJHJMj+8pJDS6wAu0/OTJZwbPPqKc=";
  };

  patches = [
    (fetchpatch {
      name = "remove-unused-opentelemetry-semantic-conventions-ai-dep.patch";
      url = "https://github.com/vllm-project/vllm/commit/6a5d7e45f52c3a13de43b8b4fa9033e3b342ebd2.patch";
      hash = "sha256-KYthqu+6XwsYYd80PtfrMMjuRV9+ionccr7EbjE4jJE=";
    })
    (fetchpatch {
      name = "fall-back-to-gloo-when-nccl-unavailable.patch";
      url = "https://github.com/vllm-project/vllm/commit/aa131a94410683b0a02e74fed2ce95e6c2b6b030.patch";
      hash = "sha256-jNlQZQ8xiW85JWyBjsPZ6FoRQsiG1J8bwzmQjnaWFBg=";
    })
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
    ./0005-drop-intel-reqs.patch
  ];

  postPatch = ''
    # pythonRelaxDeps does not cover build-system
    substituteInPlace pyproject.toml \
      --replace-fail "torch ==" "torch >=" \
      --replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"

    # Ignore the python version check because it hard-codes minor versions and
    # lags behind `ray`'s python interpreter support
    substituteInPlace CMakeLists.txt \
      --replace-fail \
      'set(PYTHON_SUPPORTED_VERSIONS' \
      'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

    # Pass build environment PYTHONPATH to vLLM's Python configuration scripts
    substituteInPlace CMakeLists.txt \
      --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
  '';

  nativeBuildInputs = [
    which
  ]
  ++ lib.optionals rocmSupport [
    rocmPackages.hipcc
  ]
  ++ lib.optionals cudaSupport [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ]
  ++ lib.optionals isCudaJetson [
    cudaPackages.autoAddCudaCompatRunpath
  ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.hostPlatform.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies = [
    aioprometheus
    blake3
    cachetools
    depyf
    fastapi
    llguidance
    lm-format-enforcer
    numpy
    openai
    opencv-python-headless
    outlines
    pandas
    prometheus-fastapi-instrumentator
    py-cpuinfo
    pyarrow
    pydantic
    python-json-logger
    python-multipart
    pyzmq
    ray
    sentencepiece
    tiktoken
    tokenizers
    msgspec
    gguf
    einops
    importlib-metadata
    partial-json-parser
    compressed-tensors
    mistral-common
    torch
    torchaudio
    torchvision
    transformers
    uvicorn
    xformers
    xgrammar
    numba
    opentelemetry-sdk
    opentelemetry-api
    opentelemetry-exporter-otlp
    bitsandbytes
    # vLLM needs Torch's compiler to be present in order to use torch.compile
    torch.stdenv.cc
  ]
  ++ uvicorn.optional-dependencies.standard
  ++ aioprometheus.optional-dependencies.starlette
  ++ lib.optionals stdenv.targetPlatform.isLinux [
    py-libnuma
    psutil
  ]
  ++ lib.optionals cudaSupport [
    cupy
    pynvml
    flashinfer
  ];

  dontUseCmakeConfigure = true;
  cmakeFlags = [
  ]
  ++ lib.optionals cudaSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
    (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
    (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
    (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
    (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
    (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
      name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
      paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
    }}")
    (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
    (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
    (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
  ]
  ++ lib.optionals cpuSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
  ];

  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate the host's supported ROCm gfx archs, which is not possible in the sandbox.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  passthru = {
    # make internal dependency available to overlays
    vllm-flash-attn = vllm-flash-attn';
    # skip bulk updates: the pinned cutlass fetcher has to be updated by hand alongside the version
    skipBulkUpdate = true;
  };

  meta = {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      happysalada
      lach
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}