Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
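# The bundled vllm-flash-attn build below can be swapped out through the
# `vllm-flash-attn` argument (also re-exported via `passthru.vllm-flash-attn`).
# A minimal, hypothetical overlay sketch, assuming the package is reachable as
# `python3Packages.vllm` and that `myVllmFlashAttn` is a pre-built replacement
# derivation (both names are illustrative, not part of this file):
#
#   final: prev: {
#     pythonPackagesExtensions = prev.pythonPackagesExtensions ++ [
#       (pyFinal: pyPrev: {
#         vllm = pyPrev.vllm.override { vllm-flash-attn = myVllmFlashAttn; };
#       })
#     ];
#   }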
{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonAtLeast,
  fetchFromGitHub,
  fetchpatch,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,
  opentelemetry-sdk,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  bitsandbytes,
  flashinfer,
  py-libnuma,

  # internal dependency - for overriding in overlays
  vllm-flash-attn ? null,

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v3.9.2";
    hash = "sha256-teziPNA9csYvhkG5t2ht8W8x5+1YGGbHm8VKx4JoxgI=";
  };

  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "575f7724b9762f265bbee5889df9c7d630801845";
      hash = "sha256-8WrKMl0olr0nYV4FRJfwSaJ0F5gWQpssoFMjr9tbHBk=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.4.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "8798f27777fb57f447070301bf33a9f9c607f491";
      hash = "sha256-UTUvATGN1NU/Bc8qo078q6bEgILLmlrjL7Yk2iAJhg4=";
    };

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    ''
    + lib.optionalString (rocmSupport) ''
      rm -rf csrc/composable_kernel;
      ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
    '';

    installPhase = ''
      cp -rva . $out
    '';
  }) vllm-flash-attn;

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.7.0/torch/utils/cpp_extension.py#L2343-L2345
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        "10.0"
        "10.0a"
        "10.1"
        "10.1a"
        "12.0"
        "12.0a"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Throw an error if none of the requested GPU targets is supported, listing what was requested.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.9.1";
  pyproject = true;

  # https://github.com/vllm-project/vllm/issues/12083
  disabled = pythonAtLeast "3.13";

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-sp7rDpewTPXTVRBJHJMj+8pJDS6wAu0/OTJZwbPPqKc=";
  };

  patches = [
    (fetchpatch {
      name = "remove-unused-opentelemetry-semantic-conventions-ai-dep.patch";
      url = "https://github.com/vllm-project/vllm/commit/6a5d7e45f52c3a13de43b8b4fa9033e3b342ebd2.patch";
      hash = "sha256-KYthqu+6XwsYYd80PtfrMMjuRV9+ionccr7EbjE4jJE=";
    })
    (fetchpatch {
      name = "fall-back-to-gloo-when-nccl-unavailable.patch";
      url = "https://github.com/vllm-project/vllm/commit/aa131a94410683b0a02e74fed2ce95e6c2b6b030.patch";
      hash = "sha256-jNlQZQ8xiW85JWyBjsPZ6FoRQsiG1J8bwzmQjnaWFBg=";
    })
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
    ./0005-drop-intel-reqs.patch
  ];

  postPatch = ''
    # pythonRelaxDeps does not cover build-system
    substituteInPlace pyproject.toml \
      --replace-fail "torch ==" "torch >=" \
      --replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"

    # Ignore the python version check because it hard-codes minor versions and
    # lags behind `ray`'s python interpreter support
    substituteInPlace CMakeLists.txt \
      --replace-fail \
        'set(PYTHON_SUPPORTED_VERSIONS' \
        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

    # Pass build environment PYTHONPATH to vLLM's Python configuration scripts
    substituteInPlace CMakeLists.txt \
      --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
  '';

  nativeBuildInputs = [
    which
  ]
  ++ lib.optionals rocmSupport [
    rocmPackages.hipcc
  ]
  ++ lib.optionals cudaSupport [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ]
  ++ lib.optionals isCudaJetson [
    cudaPackages.autoAddCudaCompatRunpath
  ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.hostPlatform.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies = [
    aioprometheus
    blake3
    cachetools
    depyf
    fastapi
    llguidance
    lm-format-enforcer
    numpy
    openai
    opencv-python-headless
    outlines
    pandas
    prometheus-fastapi-instrumentator
    py-cpuinfo
    pyarrow
    pydantic
    python-json-logger
    python-multipart
    pyzmq
    ray
    sentencepiece
    tiktoken
    tokenizers
    msgspec
    gguf
    einops
    importlib-metadata
    partial-json-parser
    compressed-tensors
    mistral-common
    torch
    torchaudio
    torchvision
    transformers
    uvicorn
    xformers
    xgrammar
    numba
    opentelemetry-sdk
    opentelemetry-api
    opentelemetry-exporter-otlp
    bitsandbytes
    # vLLM needs Torch's compiler to be present in order to use torch.compile
    torch.stdenv.cc
  ]
  ++ uvicorn.optional-dependencies.standard
  ++ aioprometheus.optional-dependencies.starlette
  ++ lib.optionals stdenv.targetPlatform.isLinux [
    py-libnuma
    psutil
  ]
  ++ lib.optionals cudaSupport [
    cupy
    pynvml
    flashinfer
  ];

  dontUseCmakeConfigure = true;
  cmakeFlags = [
  ]
  ++ lib.optionals cudaSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
    (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
    (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
    (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
    (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
    (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
      name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
      paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
    }}")
    (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
    (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
    (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
  ]
  ++ lib.optionals cpuSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
  ];

  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  passthru = {
    # make internal dependency available to overlays
    vllm-flash-attn = vllm-flash-attn';
    # updates the cutlass fetcher instead
    skipBulkUpdate = true;
  };

  meta = {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      happysalada
      lach
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}