{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonAtLeast,
  fetchFromGitHub,
  fetchpatch,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,
  opentelemetry-sdk,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  bitsandbytes,
  flashinfer,
  py-libnuma,

  # internal dependency - for overriding in overlays
  vllm-flash-attn ? null,
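  # For illustration only (the attribute name on the right-hand side is
  # hypothetical): a downstream overlay can swap in its own build with
  #   vllm.override { vllm-flash-attn = myPatchedVllmFlashAttn; }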

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;
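  # e.g. shouldUsePkg (cudaPackages.nccl or null) below evaluates to nccl only
  # on platforms where it is marked available, and to null everywhere else.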

  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v3.9.2";
    hash = "sha256-teziPNA9csYvhkG5t2ht8W8x5+1YGGbHm8VKx4JoxgI=";
  };

  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "575f7724b9762f265bbee5889df9c7d630801845";
      hash = "sha256-8WrKMl0olr0nYV4FRJfwSaJ0F5gWQpssoFMjr9tbHBk=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.4.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "8798f27777fb57f447070301bf33a9f9c607f491";
      hash = "sha256-UTUvATGN1NU/Bc8qo078q6bEgILLmlrjL7Yk2iAJhg4=";
    };

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    ''
    + lib.optionalString rocmSupport ''
      rm -rf csrc/composable_kernel
      ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
    '';

    installPhase = ''
      cp -rva . $out
    '';
  }) vllm-flash-attn;

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.7.0/torch/utils/cpp_extension.py#L2343-L2345
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        "10.0"
        "10.0a"
        "10.1"
        "10.1a"
        "12.0"
        "12.0a"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a
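  # For example:
  #   lists.subtractLists [ "8.6" ] [ "8.0" "8.6" "9.0" ] == [ "8.0" "9.0" ]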

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Abort the build with a readable error if none of the requested GPU targets
  # are supported, listing the unsupported ones in the message.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;
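  # Illustrative values: gpuArchWarner [ "8.6" ] [ "3.5" ] == [ "8.6" ], while
  # gpuArchWarner [ ] [ "3.5" ] aborts evaluation with the message above.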

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );
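  # The result is a semicolon-separated target list, e.g. "7.5;8.6;8.9+PTX" for
  # a CUDA build or "gfx90a;gfx942" for ROCm (example values only).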

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];
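  # e.g. getAllOutputs cudaPackages.cuda_cudart collects the bin, lib and dev
  # outputs (each falling back to the package's default output when missing),
  # so the symlinkJoin used for CUDA_TOOLKIT_ROOT_DIR below sees binaries,
  # libraries and headers in a single tree.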

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.9.1";
  pyproject = true;

  # https://github.com/vllm-project/vllm/issues/12083
  disabled = pythonAtLeast "3.13";

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-sp7rDpewTPXTVRBJHJMj+8pJDS6wAu0/OTJZwbPPqKc=";
  };

  patches = [
    (fetchpatch {
      name = "remove-unused-opentelemetry-semantic-conventions-ai-dep.patch";
      url = "https://github.com/vllm-project/vllm/commit/6a5d7e45f52c3a13de43b8b4fa9033e3b342ebd2.patch";
      hash = "sha256-KYthqu+6XwsYYd80PtfrMMjuRV9+ionccr7EbjE4jJE=";
    })
    (fetchpatch {
      name = "fall-back-to-gloo-when-nccl-unavailable.patch";
      url = "https://github.com/vllm-project/vllm/commit/aa131a94410683b0a02e74fed2ce95e6c2b6b030.patch";
      hash = "sha256-jNlQZQ8xiW85JWyBjsPZ6FoRQsiG1J8bwzmQjnaWFBg=";
    })
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
    ./0005-drop-intel-reqs.patch
  ];

  postPatch = ''
    # pythonRelaxDeps does not cover build-system
    substituteInPlace pyproject.toml \
      --replace-fail "torch ==" "torch >=" \
      --replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"

    # Ignore the python version check because it hard-codes minor versions and
    # lags behind `ray`'s python interpreter support
    substituteInPlace CMakeLists.txt \
      --replace-fail \
      'set(PYTHON_SUPPORTED_VERSIONS' \
      'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

    # Pass build environment PYTHONPATH to vLLM's Python configuration scripts
    substituteInPlace CMakeLists.txt \
      --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
  '';

  nativeBuildInputs = [
    which
  ]
  ++ lib.optionals rocmSupport [
    rocmPackages.hipcc
  ]
  ++ lib.optionals cudaSupport [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ]
  ++ lib.optionals isCudaJetson [
    cudaPackages.autoAddCudaCompatRunpath
  ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.hostPlatform.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies = [
    aioprometheus
    blake3
    cachetools
    depyf
    fastapi
    llguidance
    lm-format-enforcer
    numpy
    openai
    opencv-python-headless
    outlines
    pandas
    prometheus-fastapi-instrumentator
    py-cpuinfo
    pyarrow
    pydantic
    python-json-logger
    python-multipart
    pyzmq
    ray
    sentencepiece
    tiktoken
    tokenizers
    msgspec
    gguf
    einops
    importlib-metadata
    partial-json-parser
    compressed-tensors
    mistral-common
    torch
    torchaudio
    torchvision
    transformers
    uvicorn
    xformers
    xgrammar
    numba
    opentelemetry-sdk
    opentelemetry-api
    opentelemetry-exporter-otlp
    bitsandbytes
    # vLLM needs Torch's compiler to be present in order to use torch.compile
    torch.stdenv.cc
  ]
  ++ uvicorn.optional-dependencies.standard
  ++ aioprometheus.optional-dependencies.starlette
  ++ lib.optionals stdenv.targetPlatform.isLinux [
    py-libnuma
    psutil
  ]
  ++ lib.optionals cudaSupport [
    cupy
    pynvml
    flashinfer
  ];

  dontUseCmakeConfigure = true;
  cmakeFlags = [
  ]
  ++ lib.optionals cudaSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
    (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
    (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
    (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
    (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
    (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
      name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
      paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
    }}")
    (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
    (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
    (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
  ]
  ++ lib.optionals cpuSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
  ];

  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate the host's supported ROCm gfx archs, which is not possible in the sandbox.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  passthru = {
    # make internal dependency available to overlays
    vllm-flash-attn = vllm-flash-attn';
    # skip bulk updates: the pinned cutlass fetcher has to be updated by hand alongside the version
    skipBulkUpdate = true;
  };

  meta = {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      happysalada
      lach
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}