1{
2 lib,
3 stdenv,
4 python,
5 buildPythonPackage,
6 pythonRelaxDepsHook,
7 fetchFromGitHub,
8 symlinkJoin,
9 autoAddDriverRunpath,
10
11 # build system
12 packaging,
13 setuptools,
14 wheel,
15
16 # dependencies
17 which,
18 ninja,
19 cmake,
20 setuptools-scm,
21 torch,
22 outlines,
23 psutil,
24 ray,
25 pandas,
26 pyarrow,
27 sentencepiece,
28 numpy,
29 transformers,
30 xformers,
31 xgrammar,
32 numba,
33 fastapi,
34 uvicorn,
35 pydantic,
36 aioprometheus,
37 pynvml,
38 openai,
39 pyzmq,
40 tiktoken,
41 torchaudio,
42 torchvision,
43 py-cpuinfo,
44 lm-format-enforcer,
45 prometheus-fastapi-instrumentator,
46 cupy,
47 gguf,
48 einops,
49 importlib-metadata,
50 partial-json-parser,
51 compressed-tensors,
52 mistral-common,
53 msgspec,
54 numactl,
55 tokenizers,
56 oneDNN,
57 blake3,
58 depyf,
59 opencv-python-headless,
60 cachetools,
61 llguidance,
62 python-json-logger,
63 python-multipart,
64 llvmPackages,
65
66 cudaSupport ? torch.cudaSupport,
67 cudaPackages ? { },
68 rocmSupport ? torch.rocmSupport,
69 rocmPackages ? { },
70 gpuTargets ? [ ],
71}:
72
73let
74 inherit (lib)
75 lists
76 strings
77 trivial
78 ;
79
80 inherit (cudaPackages) flags;
81
82 shouldUsePkg =
83 pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;
84
  # NVIDIA CUTLASS, vendored because the build normally downloads it via
  # CMake FetchContent, which cannot happen inside the sandbox.
  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v3.8.0";
    hash = "sha256-oIzlbKRdOh6gp6nRZ8udLSqleBFoFtgM7liCBlHZLOk=";
  };
93
94 flashmla = stdenv.mkDerivation {
95 pname = "flashmla";
96 # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
97 version = "1.0.0";
98
99 # grep for GIT_TAG in the following file
100 # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
101 src = fetchFromGitHub {
102 owner = "vllm-project";
103 repo = "FlashMLA";
104 rev = "575f7724b9762f265bbee5889df9c7d630801845";
105 hash = "sha256-8WrKMl0olr0nYV4FRJfwSaJ0F5gWQpssoFMjr9tbHBk=";
106 };
107
108 dontConfigure = true;
109
110 # flashmla normally relies on `git submodule update` to fetch cutlass
111 buildPhase = ''
112 rm -rf csrc/cutlass
113 ln -sf ${cutlass} csrc/cutlass
114 '';
115
116 installPhase = ''
117 cp -rva . $out
118 '';
119 };
120
121 vllm-flash-attn = stdenv.mkDerivation {
122 pname = "vllm-flash-attn";
123 # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
124 version = "2.7.2.post1";
125
126 # grep for GIT_TAG in the following file
127 # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
128 src = fetchFromGitHub {
129 owner = "vllm-project";
130 repo = "flash-attention";
131 rev = "dc9d410b3e2d6534a4c70724c2515f4def670a22";
132 hash = "sha256-ZQ0bOBIb+8IMmya8dmimKQ17KTBplX81IirdnBJpX5M=";
133 };
134
135 dontConfigure = true;
136
137 # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
138 buildPhase = ''
139 rm -rf csrc/cutlass
140 ln -sf ${cutlass} csrc/cutlass
141 '';
142
143 installPhase = ''
144 cp -rva . $out
145 '';
146 };
147
148 cpuSupport = !cudaSupport && !rocmSupport;
149
150 # https://github.com/pytorch/pytorch/blob/v2.6.0/torch/utils/cpp_extension.py#L2046-L2048
151 supportedTorchCudaCapabilities =
152 let
153 real = [
154 "3.5"
155 "3.7"
156 "5.0"
157 "5.2"
158 "5.3"
159 "6.0"
160 "6.1"
161 "6.2"
162 "7.0"
163 "7.2"
164 "7.5"
165 "8.0"
166 "8.6"
167 "8.7"
168 "8.9"
169 "9.0"
170 "9.0a"
171 "10.0"
172 ];
173 ptx = lists.map (x: "${x}+PTX") real;
174 in
175 real ++ ptx;
176
  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a

  # For CUDA: capabilities requested via cudaPackages that Torch can build
  # for, and the remainder that it cannot.
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  # Jetson builds additionally get cudaPackages.autoAddCudaCompatRunpath
  # in nativeBuildInputs below.
  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Use trivial.throwIf to abort evaluation when no supported GPU target
  # remains, naming the unsupported requests in the error; otherwise pass
  # the supported list through unchanged.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString (semicolon-separated, e.g. "8.6;9.0").
  # Priority: explicit gpuTargets argument, then CUDA capabilities, then
  # ROCm targets; evaluating it with no GPU support at all is an error.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );
207
  # CUDA libraries later joined into one tree and handed to CMake as
  # CUDA_TOOLKIT_ROOT_DIR (see cmakeFlags below); also added to buildInputs.
  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  # (null here triggers the Gloo fallback in postPatch).
  nccl = shouldUsePkg (cudaPackages.nccl or null);
220
221 getAllOutputs = p: [
222 (lib.getBin p)
223 (lib.getLib p)
224 (lib.getDev p)
225 ];
226
227in
228
buildPythonPackage rec {
  pname = "vllm";
  version = "0.8.3";
  pyproject = true;

  # Build with torch's stdenv so compiler and C++ ABI match the torch we
  # link against.
  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    # Spell the repository name out instead of reusing `pname`: the two are
    # not required to stay in sync, and literal values keep fetchers greppable.
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-LiEBkVwJTT4WoCTk9pI0ykTjmv1pDMzksmFwVktoxMY=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
  ];
248
  # Ignore the python version check because it hard-codes minor versions and
  # lags behind `ray`'s python interpreter support
  postPatch =
    ''
      substituteInPlace CMakeLists.txt \
        --replace-fail \
          'set(PYTHON_SUPPORTED_VERSIONS' \
          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
    ''
    # When NCCL was filtered out above (nccl == null), fall back to the Gloo
    # backend that torch always provides.
    + lib.optionalString (nccl == null) ''
      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
      substituteInPlace vllm/distributed/parallel_state.py \
        --replace-fail '"nccl"' '"gloo"'
    '';
263
264 nativeBuildInputs =
265 [
266 cmake
267 ninja
268 pythonRelaxDepsHook
269 which
270 ]
271 ++ lib.optionals rocmSupport [
272 rocmPackages.hipcc
273 ]
274 ++ lib.optionals cudaSupport [
275 cudaPackages.cuda_nvcc
276 autoAddDriverRunpath
277 ]
278 ++ lib.optionals isCudaJetson [
279 cudaPackages.autoAddCudaCompatRunpath
280 ];
281
282 build-system = [
283 packaging
284 setuptools
285 wheel
286 ];
287
288 buildInputs =
289 [
290 setuptools-scm
291 torch
292 ]
293 ++ lib.optionals cpuSupport [
294 oneDNN
295 ]
296 ++ lib.optionals (cpuSupport && stdenv.isLinux) [
297 numactl
298 ]
299 ++ lib.optionals cudaSupport (
300 mergedCudaLibraries
301 ++ (with cudaPackages; [
302 nccl
303 cudnn
304 libcufile
305 ])
306 )
307 ++ lib.optionals rocmSupport (
308 with rocmPackages;
309 [
310 clr
311 rocthrust
312 rocprim
313 hipsparse
314 hipblas
315 ]
316 )
317 ++ lib.optionals stdenv.cc.isClang [
318 llvmPackages.openmp
319 ];
320
321 dependencies =
322 [
323 aioprometheus
324 blake3
325 cachetools
326 depyf
327 fastapi
328 llguidance
329 lm-format-enforcer
330 numpy
331 openai
332 opencv-python-headless
333 outlines
334 pandas
335 prometheus-fastapi-instrumentator
336 psutil
337 py-cpuinfo
338 pyarrow
339 pydantic
340 python-json-logger
341 python-multipart
342 pyzmq
343 ray
344 sentencepiece
345 tiktoken
346 tokenizers
347 msgspec
348 gguf
349 einops
350 importlib-metadata
351 partial-json-parser
352 compressed-tensors
353 mistral-common
354 torch
355 torchaudio
356 torchvision
357 transformers
358 uvicorn
359 xformers
360 xgrammar
361 numba
362 ]
363 ++ uvicorn.optional-dependencies.standard
364 ++ aioprometheus.optional-dependencies.starlette
365 ++ lib.optionals cudaSupport [
366 cupy
367 pynvml
368 ];
369
  # CMake is invoked by setup.py (see patch 0002), so skip the generic
  # configure phase and only pass flags through.
  dontUseCmakeConfigure = true;
  cmakeFlags =
    [
      # Point FetchContent / external-project lookups at the vendored
      # sources instead of the network.
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
      (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
    ]
    ++ lib.optionals cudaSupport [
      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
      # Join the split bin/lib/dev outputs of each CUDA library into one
      # tree so CMake sees a conventional toolkit root.
      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
        name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
      }}")
      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
    ]
    ++ lib.optionals cpuSupport [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
    ];
391
  # Target-device selection read by vllm's setup.py; exactly one branch
  # applies since cpuSupport is defined as neither cuda nor rocm.
  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  # Accept the dependency versions nixpkgs provides instead of upstream's
  # exact pins.
  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  # updates the cutlass fetcher instead
  passthru.skipBulkUpdate = true;
419
420 meta = with lib; {
421 description = "High-throughput and memory-efficient inference and serving engine for LLMs";
422 changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
423 homepage = "https://github.com/vllm-project/vllm";
424 license = licenses.asl20;
425 maintainers = with maintainers; [
426 happysalada
427 lach
428 ];
429 badPlatforms = [
430 # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
431 # find_isa Function invoked with incorrect arguments for function named:
432 # find_isa
433 "x86_64-darwin"
434 ];
435 };
436}