# nixpkgs mirror (for testing)
# github.com/NixOS/nixpkgs
# nix
# Package function arguments (callPackage-style), grouped by role below.
{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonRelaxDepsHook,
  fetchFromGitHub,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,
  wheel,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,

  # GPU backend selection; defaults follow however torch itself was built,
  # so vllm and torch agree on the accelerator by default.
  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:
73
let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

  # Return pkg only when it is non-null and available on the host platform,
  # otherwise null. Used below to gate platform-dependent deps (e.g. nccl).
  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # Pinned cutlass sources, shared by flashmla and vllm-flash-attn below.
  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v3.8.0";
    hash = "sha256-oIzlbKRdOh6gp6nRZ8udLSqleBFoFtgM7liCBlHZLOk=";
  };
94
  # Source-only derivation: prepares the FlashMLA tree (with the pinned
  # cutlass symlinked in) for vllm's cmake FLASH_MLA_SRC_DIR; nothing is compiled here.
  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "575f7724b9762f265bbee5889df9c7d630801845";
      hash = "sha256-8WrKMl0olr0nYV4FRJfwSaJ0F5gWQpssoFMjr9tbHBk=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass;
    # replace the submodule checkout with a symlink to our pinned copy.
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    # Copy the whole prepared source tree; vllm's build consumes it as-is.
    installPhase = ''
      cp -rva . $out
    '';
  };
121
  # Source-only derivation: prepares the vllm flash-attention tree for
  # VLLM_FLASH_ATTN_SRC_DIR (same pattern as flashmla); nothing is compiled here.
  vllm-flash-attn = stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.2.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "dc9d410b3e2d6534a4c70724c2515f4def670a22";
      hash = "sha256-ZQ0bOBIb+8IMmya8dmimKQ17KTBplX81IirdnBJpX5M=";
    };

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass;
    # replace the submodule checkout with a symlink to our pinned copy.
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    # Copy the whole prepared source tree; vllm's build consumes it as-is.
    installPhase = ''
      cp -rva . $out
    '';
  };
148
149 cpuSupport = !cudaSupport && !rocmSupport;
150
151 # https://github.com/pytorch/pytorch/blob/v2.6.0/torch/utils/cpp_extension.py#L2046-L2048
152 supportedTorchCudaCapabilities =
153 let
154 real = [
155 "3.5"
156 "3.7"
157 "5.0"
158 "5.2"
159 "5.3"
160 "6.0"
161 "6.1"
162 "6.2"
163 "7.0"
164 "7.2"
165 "7.5"
166 "8.0"
167 "8.6"
168 "8.7"
169 "8.9"
170 "9.0"
171 "9.0a"
172 "10.0"
173 ];
174 ptx = lists.map (x: "${x}+PTX") real;
175 in
176 real ++ ptx;
177
  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  # Whether this is a CUDA build targeting NVIDIA Jetson devices.
  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Abort evaluation (trivial.throwIf) when none of the requested GPU targets
  # is supported, listing the unsupported ones; otherwise pass `supported` through.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;
195
  # Create the gpuTargetString, e.g. "7.5;8.6" — consumed by TORCH_CUDA_ARCH_LIST below.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

  # CUDA libraries merged (via symlinkJoin in cmakeFlags) into a single tree
  # that is passed as CUDA_TOOLKIT_ROOT_DIR.
  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  # All of a package's bin/lib/dev outputs, for assembling the merged CUDA tree.
  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];
227
228in
229
buildPythonPackage rec {
  pname = "vllm";
  version = "0.8.3";
  pyproject = true;

  # Build with the same stdenv (compiler/ABI) that torch itself was built with.
  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-LiEBkVwJTT4WoCTk9pI0ykTjmv1pDMzksmFwVktoxMY=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
  ];

  postPatch =
    ''
      # pythonRelaxDeps does not cover build-system
      substituteInPlace pyproject.toml \
        --replace-fail "torch ==" "torch >="

      # Ignore the python version check because it hard-codes minor versions and
      # lags behind `ray`'s python interpreter support
      substituteInPlace CMakeLists.txt \
        --replace-fail \
          'set(PYTHON_SUPPORTED_VERSIONS' \
          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
    ''
    + lib.optionalString (nccl == null) ''
      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
      substituteInPlace vllm/distributed/parallel_state.py \
        --replace-fail '"nccl"' '"gloo"'
    '';

  nativeBuildInputs =
    [
      which
    ]
    ++ lib.optionals rocmSupport [
      rocmPackages.hipcc
    ]
    ++ lib.optionals cudaSupport [
      cudaPackages.cuda_nvcc
      autoAddDriverRunpath
    ]
    ++ lib.optionals isCudaJetson [
      cudaPackages.autoAddCudaCompatRunpath
    ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      # OpenMP runtime is not implicit with clang, unlike gcc
      llvmPackages.openmp
    ];

  dependencies =
    [
      aioprometheus
      blake3
      cachetools
      depyf
      fastapi
      llguidance
      lm-format-enforcer
      numpy
      openai
      opencv-python-headless
      outlines
      pandas
      prometheus-fastapi-instrumentator
      psutil
      py-cpuinfo
      pyarrow
      pydantic
      python-json-logger
      python-multipart
      pyzmq
      ray
      sentencepiece
      tiktoken
      tokenizers
      msgspec
      gguf
      einops
      importlib-metadata
      partial-json-parser
      compressed-tensors
      mistral-common
      torch
      torchaudio
      torchvision
      transformers
      uvicorn
      xformers
      xgrammar
      numba
    ]
    ++ uvicorn.optional-dependencies.standard
    ++ aioprometheus.optional-dependencies.starlette
    ++ lib.optionals cudaSupport [
      cupy
      pynvml
    ];

  # vllm's setup.py drives cmake itself; only pass flags, don't run the configure hook.
  dontUseCmakeConfigure = true;
  cmakeFlags =
    [
      # Point cmake's FetchContent / src-dir variables at our pinned sources
      # instead of letting the build download them.
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
      (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
    ]
    ++ lib.optionals cudaSupport [
      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
      # Merge the split CUDA outputs into one tree so cmake's FindCUDA sees a
      # conventional toolkit layout.
      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
        name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
      }}")
      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
    ]
    ++ lib.optionals cpuSupport [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
    ];

  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  # updates the cutlass fetcher instead
  passthru.skipBulkUpdate = true;

  meta = with lib; {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = licenses.asl20;
    maintainers = with maintainers; [
      happysalada
      lach
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}