{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonRelaxDepsHook,
  fetchFromGitHub,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  packaging,
  setuptools,
  wheel,

  # dependencies
  which,
  ninja,
  cmake,
  setuptools-scm,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v3.8.0";
    hash = "sha256-oIzlbKRdOh6gp6nRZ8udLSqleBFoFtgM7liCBlHZLOk=";
  };

  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "575f7724b9762f265bbee5889df9c7d630801845";
      hash = "sha256-8WrKMl0olr0nYV4FRJfwSaJ0F5gWQpssoFMjr9tbHBk=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  vllm-flash-attn = stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.2.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "dc9d410b3e2d6534a4c70724c2515f4def670a22";
      hash = "sha256-ZQ0bOBIb+8IMmya8dmimKQ17KTBplX81IirdnBJpX5M=";
    };

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };
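  # Both derivations above repeat the same "swap the cutlass git submodule for
  # the pinned store path" dance. A minimal sketch of how that pattern could be
  # factored out (hypothetical helper, not used in this file):
  #
  #   vendorCutlassInto =
  #     { pname, version, src }:
  #     stdenv.mkDerivation {
  #       inherit pname version src;
  #       dontConfigure = true;
  #       buildPhase = ''
  #         rm -rf csrc/cutlass
  #         ln -sf ${cutlass} csrc/cutlass
  #       '';
  #       installPhase = "cp -rva . $out";
  #     };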
  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.6.0/torch/utils/cpp_extension.py#L2046-L2048
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        "10.0"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: lists.subtractLists is perhaps a bit unintuitive: it subtracts the
  # elements of the first list *from* the second list. That means:
  # lists.subtractLists a b == b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Abort evaluation (via trivial.throwIf) when no supported GPU targets
  # remain, listing the unsupported targets that were requested.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in
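# Worked example of the capability selection above (values illustrative):
# with flags.cudaCapabilities = [ "7.5" "8.6" ], both entries appear in
# supportedTorchCudaCapabilities, so
#   supportedCudaCapabilities   == [ "7.5" "8.6" ]
#   unsupportedCudaCapabilities == [ ]
#   gpuTargetString             == "7.5;8.6"
# With only capabilities Torch does not know (say [ "2.0" ]), supported ends
# up empty and gpuArchWarner aborts evaluation, listing the requested targets.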
buildPythonPackage rec {
  pname = "vllm";
  version = "0.8.3";
  pyproject = true;

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = pname;
    tag = "v${version}";
    hash = "sha256-LiEBkVwJTT4WoCTk9pI0ykTjmv1pDMzksmFwVktoxMY=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
  ];

  # Ignore the python version check because it hard-codes minor versions and
  # lags behind `ray`'s python interpreter support
  postPatch =
    ''
      substituteInPlace CMakeLists.txt \
        --replace-fail \
          'set(PYTHON_SUPPORTED_VERSIONS' \
          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
    ''
    + lib.optionalString (nccl == null) ''
      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
      substituteInPlace vllm/distributed/parallel_state.py \
        --replace-fail '"nccl"' '"gloo"'
    '';

  nativeBuildInputs =
    [
      cmake
      ninja
      pythonRelaxDepsHook
      which
    ]
    ++ lib.optionals rocmSupport [
      rocmPackages.hipcc
    ]
    ++ lib.optionals cudaSupport [
      cudaPackages.cuda_nvcc
      autoAddDriverRunpath
    ]
    ++ lib.optionals isCudaJetson [
      cudaPackages.autoAddCudaCompatRunpath
    ];

  build-system = [
    packaging
    setuptools
    wheel
  ];

  buildInputs =
    [
      setuptools-scm
      torch
    ]
    ++ lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies =
    [
      aioprometheus
      blake3
      cachetools
      depyf
      fastapi
      llguidance
      lm-format-enforcer
      numpy
      openai
      opencv-python-headless
      outlines
      pandas
      prometheus-fastapi-instrumentator
      psutil
      py-cpuinfo
      pyarrow
      pydantic
      python-json-logger
      python-multipart
      pyzmq
      ray
      sentencepiece
      tiktoken
      tokenizers
      msgspec
      gguf
      einops
      importlib-metadata
      partial-json-parser
      compressed-tensors
      mistral-common
      torch
      torchaudio
      torchvision
      transformers
      uvicorn
      xformers
      xgrammar
      numba
    ]
    ++ uvicorn.optional-dependencies.standard
    ++ aioprometheus.optional-dependencies.starlette
    ++ lib.optionals cudaSupport [
      cupy
      pynvml
    ];

  dontUseCmakeConfigure = true;
  cmakeFlags =
    [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
      (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
    ]
    ++ lib.optionals cudaSupport [
      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
        name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
      }}")
      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
    ]
    ++ lib.optionals cpuSupport [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
    ];
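  # CUDA_TOOLKIT_ROOT_DIR above expects a single FHS-style prefix, so the
  # symlinkJoin merges the bin/lib/dev outputs of every entry in
  # mergedCudaLibraries into one tree. Illustrative shape of the result
  # (store paths and versions hypothetical):
  #
  #   /nix/store/...-cuda-merged-12.4/
  #     include/cuda_runtime.h   (from cuda_cudart.dev)
  #     lib/libcudart.so         (from cuda_cudart.lib)
  #     lib/libcublas.so         (from libcublas.lib)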
  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate the gfx archs supported by the host,
      # which is not possible in the build sandbox.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS, but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  # Bulk updates would bump `version` without re-pinning the cutlass, flashmla
  # and vllm-flash-attn sources above; update them together instead.
  passthru.skipBulkUpdate = true;

  meta = with lib; {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = licenses.asl20;
    maintainers = with maintainers; [
      happysalada
      lach
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}
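# Usage sketches (attribute paths and config values illustrative):
#
#   # build against CUDA-enabled torch:
#   nix-build -A python3Packages.vllm \
#     --arg config '{ allowUnfree = true; cudaSupport = true; }'
#
#   # or pin the GPU architectures instead of deriving them from
#   # cudaPackages.flags:
#   python3Packages.vllm.override { gpuTargets = [ "8.9" ]; }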