{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  fetchFromGitHub,
  fetchpatch,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  cbor2,
  pybase64,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,
  opentelemetry-sdk,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  bitsandbytes,
  flashinfer,
  py-libnuma,
  setproctitle,
  openai-harmony,

  # internal dependency - for overriding in overlays
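  # e.g. (illustrative): python3Packages.vllm.override { vllm-flash-attn = myVllmFlashAttn; }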
  vllm-flash-attn ? null,

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

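  # Return pkg only if it is non-null and available on the host platform; otherwise return null.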
  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for CUTLASS_REVISION
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
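  # e.g. (illustrative): `grep -n CUTLASS_REVISION CMakeLists.txt` in a vLLM checkout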
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v4.0.0";
    hash = "sha256-HJY+Go1viPkSVZPEs/NyMtYJzas4mMLiIZF3kNX+WgA=";
  };

  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "5f65b85703c7ed75fda01e06495077caad207c3f";
      hash = "sha256-DO9EFNSoAgyfRRc095v1UjT+Zdzk4cFY0+n28FVEwI0=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.2.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a";
      hash = "sha256-2r0Habd/kBpvM4/aQFIYyj+uQAa3M9gjk3DcBZHFNfA=";
    };

    patches = [
      # fix Hopper build failure
      # https://github.com/Dao-AILab/flash-attention/pull/1719
      # https://github.com/Dao-AILab/flash-attention/pull/1723
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/dad67c88d4b6122c69d0bed1cebded0cded71cea.patch";
        hash = "sha256-JSgXWItOp5KRpFbTQj/cZk+Tqez+4mEz5kmH5EUeQN4=";
      })
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/e26dd28e487117ee3e6bc4908682f41f31e6f83a.patch";
        hash = "sha256-NkCEowXSi+tiWu74Qt+VPKKavx0H9JeteovSJKToK9A=";
      })
    ];

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    ''
    + lib.optionalString rocmSupport ''
      rm -rf csrc/composable_kernel
      ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
    '';

    installPhase = ''
      cp -rva . $out
    '';
  }) vllm-flash-attn;

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.8.0/torch/utils/cpp_extension.py#L2411-L2414
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        # Blackwell (SM100+) capabilities temporarily disabled due to CUTLASS API incompatibility
        # FlashMLA kernels require CUTLASS v4.2.1+ APIs not available in the bundled v4.0.0
        # TODO: Re-enable when vLLM upgrades CUTLASS (see https://github.com/vllm-project/vllm/pull/24673)
        # "10.0"
        # "10.0a"
        # "10.1"
        # "10.1a"
        # "10.3"
        # "10.3a"
        # "12.0"
        # "12.0a"
        # "12.1"
        # "12.1a"
      ];
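      # The "+PTX" variants additionally embed PTX, so newer GPUs can JIT-compile the kernels.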
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a
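  # e.g. lists.subtractLists [ "8.6" ] [ "8.6" "9.0" ] == [ "9.0" ]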

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Throw an error if no supported GPU targets remain, listing the unsupported targets that were requested.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
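  # The result is a semicolon-separated string, e.g. "8.6;8.9;9.0" (illustrative).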
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcurand # curand_kernel.h
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    # cusparselt # cusparseLt.h
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

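  # Collect the bin, lib and dev outputs of a package; used below to assemble a merged CUDA toolkit tree.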
  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.11.0";
  pyproject = true;

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-uYK/e9McEyrDTACMk5S0cGCjai9rf6HMR9dpPL7ISYc=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0005-drop-intel-reqs.patch
    # TODO: remove the patches below once they are included in a vLLM release
    (fetchpatch {
      url = "https://github.com/vllm-project/vllm/commit/9705fba7b727a3b9c275b012258608531e2223d1.patch";
      hash = "sha256-DxRGLiwkegMlMjqFmFc0igpaVv06/Y2WjL+ISoIOET4=";
    })
    # the patch above is the preceding commit, required for the patch below to apply
    # oneDNN / CPU fix from https://github.com/vllm-project/vllm/pull/26401
    (fetchpatch {
      url = "https://github.com/vllm-project/vllm/commit/d7be1f2a480bdc62a6a1ec0126a401e3d42985fe.patch";
      hash = "sha256-Zi1k5wiOPjsbWHFKpcLq9Ns43wIP37Mbvesi5K80zaQ=";
    })
  ];

  postPatch = ''
    # pythonRelaxDeps does not cover build-system
    substituteInPlace pyproject.toml \
      --replace-fail "torch ==" "torch >=" \
      --replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"

    # Ignore the Python version check because it hard-codes minor versions and
    # lags behind `ray`'s Python interpreter support
    substituteInPlace CMakeLists.txt \
      --replace-fail \
        'set(PYTHON_SUPPORTED_VERSIONS' \
        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

    # Pass the build environment's PYTHONPATH to vLLM's Python configuration scripts
    substituteInPlace CMakeLists.txt \
      --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
  '';

  nativeBuildInputs = [
    which
  ]
  ++ lib.optionals rocmSupport [
    rocmPackages.hipcc
  ]
  ++ lib.optionals cudaSupport [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ]
  ++ lib.optionals isCudaJetson [
    cudaPackages.autoAddCudaCompatRunpath
  ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.hostPlatform.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies = [
    aioprometheus
    blake3
    cachetools
    cbor2
    depyf
    fastapi
    llguidance
    lm-format-enforcer
    numpy
    openai
    opencv-python-headless
    outlines
    pandas
    prometheus-fastapi-instrumentator
    py-cpuinfo
    pyarrow
    pybase64
    pydantic
    python-json-logger
    python-multipart
    pyzmq
    ray
    sentencepiece
    tiktoken
    tokenizers
    msgspec
    gguf
    einops
    importlib-metadata
    partial-json-parser
    compressed-tensors
    mistral-common
    torch
    torchaudio
    torchvision
    transformers
    uvicorn
    xformers
    xgrammar
    numba
    opentelemetry-sdk
    opentelemetry-api
    opentelemetry-exporter-otlp
    bitsandbytes
    setproctitle
    openai-harmony
    # vLLM needs Torch's compiler to be present in order to use torch.compile
    torch.stdenv.cc
  ]
  ++ uvicorn.optional-dependencies.standard
  ++ aioprometheus.optional-dependencies.starlette
  ++ lib.optionals stdenv.targetPlatform.isLinux [
    py-libnuma
    psutil
  ]
  ++ lib.optionals cudaSupport [
    cupy
    pynvml
    flashinfer
  ];

  dontUseCmakeConfigure = true;
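  # setup.py drives CMake itself (hence dontUseCmakeConfigure); the
  # 0002-setup.py-nix-support-respect-cmakeFlags.patch above makes it respect the flags below.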
  cmakeFlags = [
  ]
  ++ lib.optionals cudaSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
    (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
    (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
    (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
    (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
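    # CMake expects a single toolkit prefix, so merge the split cudaPackages outputs into one tree.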
    (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
      name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
      paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
    }}")
    (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
    (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
    (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
  ];

  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise the build tries to enumerate the ROCm gfx archs supported by the host, which is impossible in the sandbox.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
      FETCHCONTENT_SOURCE_DIR_ONEDNN = "${oneDNN.src}";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There is also NVCC_THREADS, but Nix/Nixpkgs has no equivalent concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  passthru = {
    # make the internal dependency available to overlays
    vllm-flash-attn = vllm-flash-attn';
    # bulk updates would end up bumping the pinned cutlass fetcher instead of `src`
    skipBulkUpdate = true;
  };

  meta = {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      happysalada
      lach
      daniel-fahey
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:188 (message):
      #   vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or
      #   RISC-V support.
      "aarch64-darwin"

      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}