{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  fetchFromGitHub,
  fetchpatch,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  cbor2,
  pybase64,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,
  opentelemetry-sdk,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  bitsandbytes,
  flashinfer,
  py-libnuma,
  setproctitle,
  openai-harmony,

  # internal dependency - for overriding in overlays
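  # e.g. (illustrative): python3Packages.vllm.override { vllm-flash-attn = myVllmFlashAttn; }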
  vllm-flash-attn ? null,

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

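  # Return pkg only if it is non-null and available on the host platform; otherwise return null.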
  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for CUTLASS_REVISION
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
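  # e.g. (illustrative): `grep -n CUTLASS_REVISION CMakeLists.txt` in a vLLM checkout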
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v4.0.0";
    hash = "sha256-HJY+Go1viPkSVZPEs/NyMtYJzas4mMLiIZF3kNX+WgA=";
  };

  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "5f65b85703c7ed75fda01e06495077caad207c3f";
      hash = "sha256-DO9EFNSoAgyfRRc095v1UjT+Zdzk4cFY0+n28FVEwI0=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.2.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a";
      hash = "sha256-2r0Habd/kBpvM4/aQFIYyj+uQAa3M9gjk3DcBZHFNfA=";
    };

    patches = [
      # fix Hopper build failure
      # https://github.com/Dao-AILab/flash-attention/pull/1719
      # https://github.com/Dao-AILab/flash-attention/pull/1723
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/dad67c88d4b6122c69d0bed1cebded0cded71cea.patch";
        hash = "sha256-JSgXWItOp5KRpFbTQj/cZk+Tqez+4mEz5kmH5EUeQN4=";
      })
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/e26dd28e487117ee3e6bc4908682f41f31e6f83a.patch";
        hash = "sha256-NkCEowXSi+tiWu74Qt+VPKKavx0H9JeteovSJKToK9A=";
      })
    ];

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    ''
    + lib.optionalString rocmSupport ''
      rm -rf csrc/composable_kernel
      ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
    '';

    installPhase = ''
      cp -rva . $out
    '';
  }) vllm-flash-attn;

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.8.0/torch/utils/cpp_extension.py#L2411-L2414
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        # Blackwell (SM100+) capabilities temporarily disabled due to CUTLASS API incompatibility
        # FlashMLA kernels require CUTLASS v4.2.1+ APIs not available in the bundled v4.0.0
        # TODO: Re-enable when vLLM upgrades CUTLASS (see https://github.com/vllm-project/vllm/pull/24673)
        # "10.0"
        # "10.0a"
        # "10.1"
        # "10.1a"
        # "10.3"
        # "10.3a"
        # "12.0"
        # "12.0a"
        # "12.1"
        # "12.1a"
      ];
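      # The "+PTX" variants additionally embed PTX, so newer GPUs can JIT-compile the kernels.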
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a
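  # e.g. lists.subtractLists [ "8.6" ] [ "8.6" "9.0" ] == [ "9.0" ]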

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Throw an error if no supported GPU targets remain, listing the unsupported targets that were requested.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
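  # The result is a semicolon-separated string, e.g. "8.6;8.9;9.0" (illustrative).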
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcurand # curand_kernel.h
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    # cusparselt # cusparseLt.h
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

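  # Collect the bin, lib and dev outputs of a package; used below to assemble a merged CUDA toolkit tree.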
  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.11.0";
  pyproject = true;

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-uYK/e9McEyrDTACMk5S0cGCjai9rf6HMR9dpPL7ISYc=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0005-drop-intel-reqs.patch
    # TODO: remove the patches below once they are included in a vLLM release
    (fetchpatch {
      url = "https://github.com/vllm-project/vllm/commit/9705fba7b727a3b9c275b012258608531e2223d1.patch";
      hash = "sha256-DxRGLiwkegMlMjqFmFc0igpaVv06/Y2WjL+ISoIOET4=";
    })
    # the patch above is the preceding commit, required for the patch below to apply
    # oneDNN / CPU fix from https://github.com/vllm-project/vllm/pull/26401
    (fetchpatch {
      url = "https://github.com/vllm-project/vllm/commit/d7be1f2a480bdc62a6a1ec0126a401e3d42985fe.patch";
      hash = "sha256-Zi1k5wiOPjsbWHFKpcLq9Ns43wIP37Mbvesi5K80zaQ=";
    })
  ];

  postPatch = ''
    # pythonRelaxDeps does not cover build-system
    substituteInPlace pyproject.toml \
      --replace-fail "torch ==" "torch >=" \
      --replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"

    # Ignore the Python version check because it hard-codes minor versions and
    # lags behind `ray`'s Python interpreter support
    substituteInPlace CMakeLists.txt \
      --replace-fail \
        'set(PYTHON_SUPPORTED_VERSIONS' \
        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

    # Pass the build environment's PYTHONPATH to vLLM's Python configuration scripts
    substituteInPlace CMakeLists.txt \
      --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
  '';

  nativeBuildInputs = [
    which
  ]
  ++ lib.optionals rocmSupport [
    rocmPackages.hipcc
  ]
  ++ lib.optionals cudaSupport [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ]
  ++ lib.optionals isCudaJetson [
    cudaPackages.autoAddCudaCompatRunpath
  ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.hostPlatform.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies = [
    aioprometheus
    blake3
    cachetools
    cbor2
    depyf
    fastapi
    llguidance
    lm-format-enforcer
    numpy
    openai
    opencv-python-headless
    outlines
    pandas
    prometheus-fastapi-instrumentator
    py-cpuinfo
    pyarrow
    pybase64
    pydantic
    python-json-logger
    python-multipart
    pyzmq
    ray
    sentencepiece
    tiktoken
    tokenizers
    msgspec
    gguf
    einops
    importlib-metadata
    partial-json-parser
    compressed-tensors
    mistral-common
    torch
    torchaudio
    torchvision
    transformers
    uvicorn
    xformers
    xgrammar
    numba
    opentelemetry-sdk
    opentelemetry-api
    opentelemetry-exporter-otlp
    bitsandbytes
    setproctitle
    openai-harmony
    # vLLM needs Torch's compiler to be present in order to use torch.compile
    torch.stdenv.cc
  ]
  ++ uvicorn.optional-dependencies.standard
  ++ aioprometheus.optional-dependencies.starlette
  ++ lib.optionals stdenv.targetPlatform.isLinux [
    py-libnuma
    psutil
  ]
  ++ lib.optionals cudaSupport [
    cupy
    pynvml
    flashinfer
  ];

  dontUseCmakeConfigure = true;
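  # setup.py drives CMake itself (hence dontUseCmakeConfigure); the
  # 0002-setup.py-nix-support-respect-cmakeFlags.patch above makes it respect the flags below.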
  cmakeFlags = [
  ]
  ++ lib.optionals cudaSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
    (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
    (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
    (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
    (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
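    # CMake expects a single toolkit prefix, so merge the split cudaPackages outputs into one tree.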
    (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
      name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
      paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
    }}")
    (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
    (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
    (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
  ];

  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise the build tries to enumerate the ROCm gfx archs supported by the host, which is impossible in the sandbox.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
      FETCHCONTENT_SOURCE_DIR_ONEDNN = "${oneDNN.src}";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There is also NVCC_THREADS, but Nix/Nixpkgs has no equivalent concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  passthru = {
    # make the internal dependency available to overlays
    vllm-flash-attn = vllm-flash-attn';
    # bulk updates would end up bumping the pinned cutlass fetcher instead of `src`
    skipBulkUpdate = true;
  };

  meta = {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      happysalada
      lach
      daniel-fahey
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:188 (message):
      #   vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or
      #   RISC-V support.
      "aarch64-darwin"

      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}