Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  fetchFromGitHub,
  fetchpatch,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  cbor2,
  pybase64,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,
  opentelemetry-sdk,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  bitsandbytes,
  flashinfer,
  py-libnuma,
  setproctitle,
  openai-harmony,

  # internal dependency - for overriding in overlays
  vllm-flash-attn ? null,

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for CUTLASS_REVISION
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v4.0.0";
    hash = "sha256-HJY+Go1viPkSVZPEs/NyMtYJzas4mMLiIZF3kNX+WgA=";
  };

  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "5f65b85703c7ed75fda01e06495077caad207c3f";
      hash = "sha256-DO9EFNSoAgyfRRc095v1UjT+Zdzk4cFY0+n28FVEwI0=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.2.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a";
      hash = "sha256-2r0Habd/kBpvM4/aQFIYyj+uQAa3M9gjk3DcBZHFNfA=";
    };

    patches = [
      # fix Hopper build failure
      # https://github.com/Dao-AILab/flash-attention/pull/1719
      # https://github.com/Dao-AILab/flash-attention/pull/1723
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/dad67c88d4b6122c69d0bed1cebded0cded71cea.patch";
        hash = "sha256-JSgXWItOp5KRpFbTQj/cZk+Tqez+4mEz5kmH5EUeQN4=";
      })
      (fetchpatch {
        url = "https://github.com/Dao-AILab/flash-attention/commit/e26dd28e487117ee3e6bc4908682f41f31e6f83a.patch";
        hash = "sha256-NkCEowXSi+tiWu74Qt+VPKKavx0H9JeteovSJKToK9A=";
      })
    ];

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    ''
    + lib.optionalString rocmSupport ''
      rm -rf csrc/composable_kernel
      ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
    '';

    installPhase = ''
      cp -rva . $out
    '';
  }) vllm-flash-attn;

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.8.0/torch/utils/cpp_extension.py#L2411-L2414
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        # Blackwell (SM100+) capabilities temporarily disabled due to CUTLASS API incompatibility
        # FlashMLA kernels require CUTLASS v4.2.1+ APIs not available in bundled v4.0.0
        # TODO: Re-enable when vLLM upgrades CUTLASS (see https://github.com/vllm-project/vllm/pull/24673)
        # "10.0"
        # "10.0a"
        # "10.1"
        # "10.1a"
        # "10.3"
        # "10.3a"
        # "12.0"
        # "12.0a"
        # "12.1"
        # "12.1a"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Throw an error (via trivial.throwIf) if no supported GPU targets remain,
  # listing the unsupported targets that were requested.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );
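
  # Illustrative walk-through of the capability filtering above; the concrete
  # capability list is hypothetical, not taken from this file. If
  # flags.cudaCapabilities were [ "8.6" "12.0" ], then:
  #   supportedCudaCapabilities   = [ "8.6" ]   (intersection; "12.0" is commented out in `real` above)
  #   unsupportedCudaCapabilities = [ "12.0" ]  (requested but filtered out)
  # and, with cudaSupport enabled and gpuTargets left at its default [ ]:
  #   gpuTargetString = "8.6"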

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcurand # curand_kernel.h
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    # cusparselt # cusparseLt.h
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.11.0";
  pyproject = true;

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-uYK/e9McEyrDTACMk5S0cGCjai9rf6HMR9dpPL7ISYc=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0005-drop-intel-reqs.patch
    # TODO: Remove the below patches when included in vLLM release
    (fetchpatch {
      url = "https://github.com/vllm-project/vllm/commit/9705fba7b727a3b9c275b012258608531e2223d1.patch";
      hash = "sha256-DxRGLiwkegMlMjqFmFc0igpaVv06/Y2WjL+ISoIOET4=";
    })
    # patch above is previous commit needed to apply patch below
    # oneDNN / CPU fix from https://github.com/vllm-project/vllm/pull/26401
    (fetchpatch {
      url = "https://github.com/vllm-project/vllm/commit/d7be1f2a480bdc62a6a1ec0126a401e3d42985fe.patch";
      hash = "sha256-Zi1k5wiOPjsbWHFKpcLq9Ns43wIP37Mbvesi5K80zaQ=";
    })
  ];

  postPatch = ''
    # pythonRelaxDeps does not cover build-system
    substituteInPlace pyproject.toml \
      --replace-fail "torch ==" "torch >=" \
      --replace-fail "setuptools>=77.0.3,<80.0.0" "setuptools"

    # Ignore the python version check because it hard-codes minor versions and
    # lags behind `ray`'s python interpreter support
    substituteInPlace CMakeLists.txt \
      --replace-fail \
        'set(PYTHON_SUPPORTED_VERSIONS' \
        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'

    # Pass build environment PYTHONPATH to vLLM's Python configuration scripts
    substituteInPlace CMakeLists.txt \
      --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
  '';

  nativeBuildInputs = [
    which
  ]
  ++ lib.optionals rocmSupport [
    rocmPackages.hipcc
  ]
  ++ lib.optionals cudaSupport [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ]
  ++ lib.optionals isCudaJetson [
    cudaPackages.autoAddCudaCompatRunpath
  ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.hostPlatform.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies = [
    aioprometheus
    blake3
    cachetools
    cbor2
    depyf
    fastapi
    llguidance
    lm-format-enforcer
    numpy
    openai
    opencv-python-headless
    outlines
    pandas
    prometheus-fastapi-instrumentator
    py-cpuinfo
    pyarrow
    pybase64
    pydantic
    python-json-logger
    python-multipart
    pyzmq
    ray
    sentencepiece
    tiktoken
    tokenizers
    msgspec
    gguf
    einops
    importlib-metadata
    partial-json-parser
    compressed-tensors
    mistral-common
    torch
    torchaudio
    torchvision
    transformers
    uvicorn
    xformers
    xgrammar
    numba
    opentelemetry-sdk
    opentelemetry-api
    opentelemetry-exporter-otlp
    bitsandbytes
    setproctitle
    openai-harmony
    # vLLM needs Torch's compiler to be present in order to use torch.compile
    torch.stdenv.cc
  ]
  ++ uvicorn.optional-dependencies.standard
  ++ aioprometheus.optional-dependencies.starlette
  ++ lib.optionals stdenv.targetPlatform.isLinux [
    py-libnuma
    psutil
  ]
  ++ lib.optionals cudaSupport [
    cupy
    pynvml
    flashinfer
  ];

  dontUseCmakeConfigure = true;
  cmakeFlags = [
  ]
  ++ lib.optionals cudaSupport [
    (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
    (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
    (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
    (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
    (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
    (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
      name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
      paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
    }}")
    (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
    (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
    (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
  ];

  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
      FETCHCONTENT_SOURCE_DIR_ONEDNN = "${oneDNN.src}";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  passthru = {
    # make internal dependency available to overlays
    vllm-flash-attn = vllm-flash-attn';
    # updates the cutlass fetcher instead
    skipBulkUpdate = true;
  };

  meta = {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      happysalada
      lach
      daniel-fahey
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:188 (message):
      #   vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or
      #   RISC-V support.
503 "aarch64-darwin" 504 505 # CMake Error at cmake/cpu_extension.cmake:78 (find_isa): 506 # find_isa Function invoked with incorrect arguments for function named: 507 # find_isa 508 "x86_64-darwin" 509 ]; 510 }; 511}