{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonRelaxDepsHook,
  fetchFromGitHub,
  which,
  ninja,
  cmake,
  packaging,
  setuptools,
  torch,
  outlines,
  wheel,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  writeShellScript,

  config,

  cudaSupport ? config.cudaSupport,
  cudaPackages ? { },

  # The backend has to be either ROCm or CUDA; default to the free (ROCm) one
  rocmSupport ? !config.cudaSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}@args:

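# A minimal sketch (not part of the original expression) of selecting a
# backend explicitly, assuming this file is reached through callPackage as
# usual for python-packages.nix:
#
#   vllm-cuda = python3Packages.vllm.override {
#     cudaSupport = true;
#     rocmSupport = false;
#   };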
let
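  # vLLM's CMake build pulls CUTLASS in through FetchContent, which would
  # download it at build time; the sandbox forbids network access, so pin the
  # source here and hand it to CMake via FETCHCONTENT_SOURCE_DIR_CUTLASS below.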
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    rev = "refs/tags/v3.5.0";
    hash = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
  };
in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.6.2";
  pyproject = true;

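  # CUDA builds need the NVCC-compatible toolchain; `stdenv` is shadowed by
  # this attribute, so the unmodified one stays reachable via the `@args`
  # binding above.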
  stdenv = if cudaSupport then cudaPackages.backendStdenv else args.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = pname;
    rev = "refs/tags/v${version}";
    hash = "sha256-zUkqAPPhDRdN9rDQ2biCl1B+trV0xIHXub++v9zsQGo=";
  };

  patches = [
    ./0001-setup.py-don-t-ask-for-hipcc-version.patch
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
  ];

  # Ignore the Python version check because it hard-codes minor versions and
  # lags behind `ray`'s Python interpreter support.
  postPatch = ''
    substituteInPlace CMakeLists.txt \
      --replace-fail \
        'set(PYTHON_SUPPORTED_VERSIONS' \
        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
  '';

  nativeBuildInputs = [
    cmake
    ninja
    pythonRelaxDepsHook
    which
  ] ++ lib.optionals rocmSupport [ rocmPackages.hipcc ];

  build-system = [
    packaging
    setuptools
    wheel
  ];

  buildInputs =
    (lib.optionals cudaSupport (
      with cudaPackages;
      [
        cuda_cudart # cuda_runtime.h, -lcudart
        cuda_cccl
        libcusparse # cusparse.h
        libcusolver # cusolverDn.h
        cuda_nvcc
        cuda_nvtx
        libcublas
      ]
    ))
    ++ (lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    ));

  dependencies =
    [
      aioprometheus
      fastapi
      lm-format-enforcer
      numpy
      openai
      outlines
      pandas
      prometheus-fastapi-instrumentator
      psutil
      py-cpuinfo
      pyarrow
      pydantic
      pyzmq
      ray
      sentencepiece
      tiktoken
      torch
      torchvision
      transformers
      uvicorn
      xformers
    ]
    ++ uvicorn.optional-dependencies.standard
    ++ aioprometheus.optional-dependencies.starlette
    ++ lib.optionals cudaSupport [
      cupy
      pynvml
    ];

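  # setup.py drives CMake itself, so skip the generic CMake configure phase;
  # the flags below still take effect because the 0002 patch makes setup.py
  # respect cmakeFlags.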
  dontUseCmakeConfigure = true;
  cmakeFlags = [ (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}") ];

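  # CUDA_HOME is how setup.py (via torch's cpp_extension machinery) locates
  # nvcc and the rest of the CUDA toolkit.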
  env =
    lib.optionalAttrs cudaSupport { CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; }
    // lib.optionalAttrs rocmSupport {
      # Otherwise it tries to enumerate the ROCm gfx archs supported by the
      # host, which is impossible under sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    };

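  # vLLM's requirements files pin dependency versions tightly; relax every pin
  # so the versions packaged in nixpkgs are accepted.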
  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  meta = with lib; {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = licenses.asl20;
    maintainers = with maintainers; [
      happysalada
      lach
    ];
    # Fails at runtime with "RuntimeError: Unknown runtime environment".
    broken = true;
    # broken = !cudaSupport && !rocmSupport;
  };
}